Upload Ron-110M: pretrain + summarizer + tokenizer + code

Browse files

Files changed (14) hide show

README.md +129 -0
code/__init__.py +0 -0
code/ask.py +204 -0
code/finetune_sft.py +484 -0
code/make_cnndm_sft.py +130 -0
code/model.py +189 -0
code/prepare_wikitext.py +151 -0
code/tokenizer.py +52 -0
code/train.py +630 -0
config.json +15 -0
meta.json +12 -0
pretrain.pt +3 -0
summarizer.pt +3 -0
tokenizer.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,129 @@

+---
+language:
+- en
+license: mit
+tags:
+- gpt
+- text-generation
+- summarization
+- from-scratch
+- pytorch
+library_name: pytorch
+---
+# Ron-110M
+A 110M-parameter GPT-style language model trained from scratch on a single
+RTX 3090. Pretrained on WikiText-103, then fine-tuned on CNN/DailyMail for
+extractive news summarization.
+This is a learning / research model. It is small, the tokenizer is a custom
+byte-level BPE, and it does not use the Hugging Face `transformers` model
+classes. The repo includes the original PyTorch code so you can run, fine-tune,
+or continue pretraining from these weights.
+## Files
+- `pretrain.pt` - base language model checkpoint (after WikiText-103 pretraining)
+- `summarizer.pt` - SFT checkpoint for news summarization (start from this for inference)
+- `tokenizer.json` - byte-level BPE tokenizer (32k vocab, specials: `<pad> <bos> <eos> <unk>`)
+- `meta.json` - dataset metadata (vocab size, dtype, token counts)
+- `code/model.py` - GPT model definition
+- `code/tokenizer.py` - tokenizer wrapper with ByteLevel decoder fix
+- `code/ask.py` - inference script with repetition penalty, top-p, no-repeat-ngram
+- `code/train.py` - pretraining script
+- `code/finetune_sft.py` - supervised fine-tuning script
+- `code/make_cnndm_sft.py` - CNN/DailyMail SFT data builder
+- `code/prepare_wikitext.py` - WikiText-103 tokenization + tokenizer training
+## Architecture
+```
+n_layer       = 12
+n_head        = 12
+n_embd        = 768
+block_size    = 512
+vocab_size    = 32000
+parameters    = 109.92M
+```
+## Training results
+| Stage              | Dataset        | Steps  | Final val loss |
+|--------------------|---------------|--------|----------------|
+| Pretrain           | WikiText-103  | 12,000 | 3.15           |
+| SFT (summarizer)   | CNN/DailyMail | 6,000  | 2.97           |
+## Quick start
+```bash
+# Clone this repo
+git lfs install
+git clone https://huggingface.co/endurasolution/RON-110M
+cd RON-110M
+# Install minimal deps
+pip install torch numpy tokenizers rich
+# Run inference
+python code/ask.py \
+  --checkpoint summarizer.pt \
+  --tokenizer tokenizer.json \
+  --text "A man has been arrested in Manchester after a series of break-ins at local shops. Police said the suspect was found with stolen goods. He is due to appear in court on Monday." \
+  --max_new_tokens 80 \
+  --temperature 0.4 \
+  --top_p 0.9 \
+  --repetition_penalty 1.1 \
+  --no_repeat_ngram_size 3
+```
+Expected output (paraphrased): a short news-style summary that preserves the key
+facts from the input.
+## Continue training
+To resume pretraining from `pretrain.pt`:
+```bash
+python code/train.py \
+  --resume pretrain.pt \
+  --reset_step --reset_optimizer \
+  --data_dir data/wikitext103 \
+  --out_dir runs/wikitext-gpt \
+  --preset rtx3090_8h \
+  --batch_size 16 --grad_accum 8 \
+  --max_steps 12000 \
+  --learning_rate 2e-4 --min_lr 2e-5 \
+  --warmup_steps 200 \
+  --no_gradient_checkpointing \
+  --save_optimizer
+```
+To fine-tune for a new task, prepare a JSONL file with `prompt` and `answer`
+keys, then:
+```bash
+python code/finetune_sft.py \
+  --base_checkpoint pretrain.pt \
+  --tokenizer tokenizer.json \
+  --sft_file your_data.jsonl \
+  --out_dir runs/my-finetune \
+  --max_steps 6000 \
+  --batch_size 8 --grad_accum 8 \
+  --learning_rate 5e-5 --min_lr 5e-6 \
+  --warmup_steps 200
+```
+## Limitations
+- Small (110M parameters) - knowledge is limited, hallucinations possible on
+  out-of-domain inputs.
+- Tokenizer is custom byte-level BPE - **must** be loaded with the included
+  `tokenizer.json`. Do not substitute a GPT-2 tokenizer.
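+- Not compatible with `transformers.AutoModel`. Use the included `code/`. The
+  scripts import their siblings as `searshorai.model` and `searshorai.tokenizer`,
+  so `code/` must be importable under the package name `searshorai` (for example
+  a copy or symlink named `searshorai` on `PYTHONPATH`) before you run them.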
+- SFT data was CNN/DailyMail news. The model is most reliable on news-style
+  English; expect weaker output on code, math, or conversational input.
+## License
+MIT.

code/__init__.py ADDED Viewed

File without changes

code/ask.py ADDED Viewed

	@@ -0,0 +1,204 @@

+from __future__ import annotations
+import argparse
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from searshorai.model import GPT, GPTConfig
+from searshorai.tokenizer import TextTokenizer
+# Must match the prompts used in make_xsum_sft.py / make_paragraph_sft.py.
+# Using the first template is the canonical choice at inference time.
+DEFAULT_PROMPT_TEMPLATE = (
+    "Read the article and write a one-sentence summary.\n\n"
+    "Article:\n{passage}\n\nSummary:\n"
+)
+def strip_compile_prefix(state_dict):
+    cleaned = {}
+    for key, value in state_dict.items():
+        if key.startswith("_orig_mod."):
+            key = key[len("_orig_mod.") :]
+        cleaned[key] = value
+    return cleaned
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Ask the paragraph-explainer model.")
+    parser.add_argument("--checkpoint", type=Path, required=True)
+    parser.add_argument("--tokenizer", type=Path, default=Path("data/wikitext103/tokenizer.json"))
+    parser.add_argument("--text", type=str, required=True, help="The passage to explain.")
+    parser.add_argument("--prompt_template", type=str, default=DEFAULT_PROMPT_TEMPLATE)
+    parser.add_argument("--max_new_tokens", type=int, default=120)
+    parser.add_argument("--temperature", type=float, default=0.7)
+    parser.add_argument("--top_k", type=int, default=40)
+    parser.add_argument("--top_p", type=float, default=0.9,
+                        help="Nucleus sampling cutoff. Set 1.0 to disable.")
+    parser.add_argument("--repetition_penalty", type=float, default=1.3,
+                        help="Penalty for re-emitting tokens already in the context. 1.0 = off.")
+    parser.add_argument("--no_repeat_ngram_size", type=int, default=3,
+                        help="Block any n-gram of this size from appearing twice. 0 = off.")
+    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"])
+    parser.add_argument("--seed", type=int, default=0)
+    return parser.parse_args()
+def banned_tokens_from_ngrams(generated: list[int], n: int) -> set[int]:
+    """
+    For no-repeat-ngram blocking: given the tokens generated so far, return
+    the set of token ids that would close a previously-seen n-gram if emitted
+    next.
+    """
+    if n <= 0 or len(generated) < n - 1:
+        return set()
+    prefix = tuple(generated[-(n - 1):])
+    banned: set[int] = set()
+    for i in range(len(generated) - n + 1):
+        ngram = tuple(generated[i : i + n - 1])
+        if ngram == prefix:
+            banned.add(generated[i + n - 1])
+    return banned
+def generate(
+    model: GPT,
+    prompt_ids: list[int],
+    max_new_tokens: int,
+    temperature: float,
+    top_k: int,
+    top_p: float,
+    repetition_penalty: float,
+    no_repeat_ngram_size: int,
+    eos_id: int | None,
+    device: str,
+) -> list[int]:
+    """
+    Custom sampling loop with repetition penalty, top-k, top-p (nucleus),
+    and no-repeat-ngram blocking. Returns the list of newly generated token
+    ids (does not include the prompt).
+    Args:
+        model: Called as ``model(x)``; must return a ``(logits, _)`` pair and
+            expose ``config.block_size``.
+        prompt_ids: Token ids of the prompt; copied, never mutated.
+        max_new_tokens: Upper bound on the number of sampled tokens.
+        temperature: Logit divisor, clamped to at least 1e-5.
+        top_k: Keep only the k highest logits; ``None`` or <= 0 disables.
+        top_p: Nucleus cutoff; values >= 1.0 disable.
+        repetition_penalty: Applied to every token id present in the context
+            (prompt included); 1.0 disables.
+        no_repeat_ngram_size: N-gram size blocked within the generated tokens
+            only (the prompt is not considered); 0 disables.
+        eos_id: Sampling stops when this id is drawn; it is not returned.
+        device: Device on which the input tensor is created.
+    """
+    block_size = model.config.block_size
+    context = list(prompt_ids)
+    generated: list[int] = []
+    for _ in range(max_new_tokens):
+        # Feed at most the last block_size tokens of prompt + generated text.
+        idx_cond = context if len(context) <= block_size else context[-block_size:]
+        x = torch.tensor([idx_cond], dtype=torch.long, device=device)
+        with torch.no_grad():
+            logits, _ = model(x)
+        # Logits of the last position only; assumes logits is (1, seq, vocab) - TODO confirm.
+        logits = logits[:, -1, :].squeeze(0).float()
+        if repetition_penalty != 1.0 and len(context) > 0:
+            seen = torch.tensor(list(set(context)), dtype=torch.long, device=device)
+            scores = logits[seen]
+            # Positive scores are divided and non-positive ones multiplied, so a
+            # penalty > 1 always lowers the score of an already-seen token.
+            scores = torch.where(scores > 0, scores / repetition_penalty, scores * repetition_penalty)
+            logits[seen] = scores
+        if no_repeat_ngram_size > 0 and len(generated) >= no_repeat_ngram_size - 1:
+            banned = banned_tokens_from_ngrams(generated, no_repeat_ngram_size)
+            for tok_id in banned:
+                logits[tok_id] = -float("inf")
+        logits = logits / max(temperature, 1e-5)
+        if top_k is not None and top_k > 0:
+            k = min(top_k, logits.size(-1))
+            top_vals, _ = torch.topk(logits, k)
+            # Smallest of the k kept values; everything strictly below it is dropped.
+            cutoff = top_vals[-1]
+            logits = torch.where(logits < cutoff, torch.full_like(logits, -float("inf")), logits)
+        if top_p < 1.0:
+            sorted_logits, sorted_idx = torch.sort(logits, descending=True)
+            probs_sorted = F.softmax(sorted_logits, dim=-1)
+            cumulative = torch.cumsum(probs_sorted, dim=-1)
+            mask = cumulative > top_p
+            # Shift the mask right by one so the token that crosses top_p is kept,
+            # and never mask the single most likely token.
+            mask[..., 1:] = mask[..., :-1].clone()
+            mask[..., 0] = False
+            sorted_logits = sorted_logits.masked_fill(mask, -float("inf"))
+            # Scatter the filtered values back to their original vocabulary positions.
+            logits = torch.full_like(logits, -float("inf"))
+            logits.scatter_(0, sorted_idx, sorted_logits)
+        probs = F.softmax(logits, dim=-1)
+        # Fall back to argmax when the distribution is unusable (e.g. NaN after
+        # every candidate was masked to -inf).
+        if not torch.isfinite(probs).all() or probs.sum() <= 0:
+            next_tok = int(logits.argmax().item())
+        else:
+            next_tok = int(torch.multinomial(probs, num_samples=1).item())
+        # EOS ends generation and is neither appended to the context nor returned.
+        if eos_id is not None and next_tok == eos_id:
+            break
+        context.append(next_tok)
+        generated.append(next_tok)
+    return generated
+def main() -> None:
+    """Load checkpoint and tokenizer, build the prompt, sample a completion and print it.
+    Raises:
+        RuntimeError: If the tokenizer and model vocabulary sizes differ, or if
+            ``--max_new_tokens`` leaves fewer than 16 prompt tokens in the block.
+    """
+    args = parse_args()
+    # NOTE(review): a seed of 0 (the default) is falsy, so no seeding happens and
+    # runs are not reproducible unless a non-zero seed is given - confirm intended.
+    if args.seed:
+        torch.manual_seed(args.seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(args.seed)
+    if args.device == "auto":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    else:
+        device = args.device
+    tok = TextTokenizer(args.tokenizer)
+    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
+    # only load checkpoints from a trusted source.
+    ckpt = torch.load(args.checkpoint, map_location=device, weights_only=False)
+    config = GPTConfig(**ckpt["config"])
+    # Inference-time overrides of the stored training configuration.
+    config.dropout = 0.0
+    config.gradient_checkpointing = False
+    model = GPT(config)
+    state_dict = strip_compile_prefix(ckpt["model"])
+    model.load_state_dict(state_dict, strict=True)
+    model.to(device)
+    model.eval()
+    if tok.vocab_size != model.config.vocab_size:
+        raise RuntimeError(
+            f"Tokenizer vocab_size {tok.vocab_size} != model vocab_size {model.config.vocab_size}. "
+            "Use the same tokenizer.json that was used for pretrain/SFT."
+        )
+    prompt = args.prompt_template.format(passage=args.text.strip())
+    prompt_ids = tok.encode(prompt, add_bos=True, add_eos=False)
+    # Reserve room in the block for the tokens that will be generated.
+    max_prompt_len = model.config.block_size - args.max_new_tokens - 1
+    if max_prompt_len < 16:
+        raise RuntimeError(
+            f"max_new_tokens={args.max_new_tokens} is too large for block_size={model.config.block_size}."
+        )
+    if len(prompt_ids) > max_prompt_len:
+        # Over-long prompt: keep BOS (if present) plus the tail of the prompt, so the
+        # end of the template stays in front of the generated text.
+        bos = [prompt_ids[0]] if prompt_ids and prompt_ids[0] == tok.bos_id else []
+        tail = prompt_ids[-(max_prompt_len - len(bos)) :]
+        prompt_ids = bos + tail
+    new_ids = generate(
+        model=model,
+        prompt_ids=prompt_ids,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        repetition_penalty=args.repetition_penalty,
+        no_repeat_ngram_size=args.no_repeat_ngram_size,
+        eos_id=tok.eos_id,
+        device=device,
+    )
+    answer = tok.decode(new_ids, skip_special_tokens=True).strip()
+    print(answer)
+# Script entry point.
+if __name__ == "__main__":
+    main()

code/finetune_sft.py ADDED Viewed

	@@ -0,0 +1,484 @@

+from __future__ import annotations
+import argparse
+import json
+import math
+import random
+import time
+import unicodedata
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import torch
+from rich.console import Console
+from torch.nn.utils.rnn import pad_sequence
+from searshorai.model import GPT, GPTConfig
+from searshorai.tokenizer import TextTokenizer
+# Shared rich console used for all status output of this script.
+console = Console()
+@dataclass
+class Example:
+    """One tokenized SFT sample: model inputs plus same-length next-token targets."""
+    input_ids: list[int]  # tokens fed to the model
+    labels: list[int]  # target token per position; -100 marks masked (prompt/padding) positions
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Stable supervised fine-tune for paragraph explanation.")
+    parser.add_argument("--base_checkpoint", type=Path, default=Path("runs/wikitext-gpt/best.pt"))
+    parser.add_argument("--tokenizer", type=Path, default=Path("data/wikitext103/tokenizer.json"))
+    parser.add_argument("--sft_file", type=Path, default=Path("data/wikitext103/paragraph_sft.jsonl"))
+    parser.add_argument("--out_dir", type=Path, default=Path("runs/paragraph-explainer"))
+    parser.add_argument("--max_steps", type=int, default=8000)
+    parser.add_argument("--batch_size", type=int, default=8)
+    parser.add_argument("--grad_accum", type=int, default=8)
+    parser.add_argument("--learning_rate", type=float, default=2e-5)
+    parser.add_argument("--min_lr", type=float, default=2e-6)
+    parser.add_argument("--warmup_steps", type=int, default=300)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--grad_clip", type=float, default=1.0)
+    parser.add_argument("--max_answer_tokens", type=int, default=220)
+    parser.add_argument("--min_answer_tokens", type=int, default=8)
+    parser.add_argument("--val_ratio", type=float, default=0.02)
+    parser.add_argument("--eval_interval", type=int, default=250)
+    parser.add_argument("--eval_batches", type=int, default=40)
+    parser.add_argument("--save_interval", type=int, default=500)
+    parser.add_argument("--log_interval", type=int, default=20)
+    parser.add_argument("--seed", type=int, default=1337)
+    parser.add_argument("--compile", action="store_true")
+    parser.add_argument("--resume", type=Path, default=None)
+    return parser.parse_args()
+def clean_text(text: Any) -> str:
+    if text is None:
+        return ""
+    text = str(text)
+    text = text.replace("\ufffd", " ")
+    text = unicodedata.normalize("NFKC", text)
+    text = "".join(ch if (ch in ("\n", "\t") or ord(ch) >= 32) else " " for ch in text)
+    text = "\n".join(" ".join(line.split()) for line in text.splitlines())
+    return text.strip()
+def get_special_id(tok: TextTokenizer, name: str) -> int | None:
+    value = getattr(tok, name, None)
+    return int(value) if isinstance(value, int) else None
+def ensure_eos(ids: list[int], eos_id: int | None) -> list[int]:
+    if eos_id is None:
+        return ids
+    if not ids or ids[-1] != eos_id:
+        return ids + [eos_id]
+    return ids
+def get_lr(step: int, args: argparse.Namespace) -> float:
+    if step < args.warmup_steps:
+        return args.learning_rate * (step + 1) / max(1, args.warmup_steps)
+    ratio = (step - args.warmup_steps) / max(1, args.max_steps - args.warmup_steps)
+    coeff = 0.5 * (1.0 + math.cos(math.pi * min(1.0, max(0.0, ratio))))
+    return args.min_lr + coeff * (args.learning_rate - args.min_lr)
+def read_prompt_answer(row: dict[str, Any]) -> tuple[str, str]:
+    """
+    Supports these JSONL styles:
+      {"prompt": "...", "answer": "..."}
+      {"input": "...", "output": "..."}
+      {"paragraph": "...", "explanation": "..."}
+      {"text": "...", "answer": "..."}
+    """
+    if "prompt" in row:
+        prompt = row.get("prompt", "")
+    elif "paragraph" in row:
+        prompt = f"Explain this paragraph in simple words:\n\n{row.get('paragraph', '')}\n\nExplanation:\n"
+    elif "text" in row:
+        prompt = f"Explain this paragraph in simple words:\n\n{row.get('text', '')}\n\nExplanation:\n"
+    else:
+        prompt = row.get("input", "")
+    answer = (
+        row.get("answer")
+        if row.get("answer") is not None
+        else row.get("output")
+        if row.get("output") is not None
+        else row.get("explanation", "")
+    )
+    return clean_text(prompt), clean_text(answer)
+def load_examples(path: Path, tok: TextTokenizer, block_size: int, args: argparse.Namespace) -> list[Example]:
+    """Read a JSONL SFT file and turn each usable row into a masked training Example.
+    Each row is split into prompt and answer, tokenized, truncated so that
+    prompt + answer fits in ``block_size + 1`` tokens, and shifted by one to form
+    inputs and targets. Targets belonging to the prompt are set to -100.
+    Args:
+        path: JSONL file with one record per line.
+        tok: Tokenizer providing ``encode`` and ``bos_id`` (and optionally ``eos_id``).
+        block_size: Maximum number of input positions per example.
+        args: Uses ``min_answer_tokens`` and ``max_answer_tokens``.
+    Returns:
+        The list of examples that have at least one unmasked target.
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+        RuntimeError: If fewer than 10 examples survive filtering.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"SFT file not found: {path}")
+    eos_id = get_special_id(tok, "eos_id")
+    examples: list[Example] = []
+    # Counters reported in the summary line after loading.
+    skipped_empty = 0
+    skipped_too_short = 0
+    truncated_answers = 0
+    bad_json = 0
+    with path.open("r", encoding="utf-8", errors="replace") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError:
+                # Malformed lines are counted and skipped rather than aborting the load.
+                bad_json += 1
+                continue
+            prompt, answer = read_prompt_answer(row)
+            if not prompt or not answer:
+                skipped_empty += 1
+                continue
+            prompt_ids = tok.encode(prompt, add_bos=True, add_eos=False)
+            # Encode answer without EOS, then add EOS after any truncation.
+            answer_ids = tok.encode(answer, add_bos=False, add_eos=False)
+            if len(answer_ids) < args.min_answer_tokens:
+                skipped_too_short += 1
+                continue
+            if len(answer_ids) > args.max_answer_tokens:
+                answer_ids = answer_ids[: args.max_answer_tokens]
+                truncated_answers += 1
+            answer_ids = ensure_eos(answer_ids, eos_id)
+            # full_ids must fit in block_size + 1 (we shift to get input/target).
+            room_for_prompt = (block_size + 1) - len(answer_ids)
+            if room_for_prompt < 16:
+                # Answer is huge - cut it further but keep EOS at the end.
+                keep = max(16, block_size - 32)
+                answer_ids = answer_ids[: keep - 1]
+                answer_ids = ensure_eos(answer_ids, eos_id)
+                room_for_prompt = (block_size + 1) - len(answer_ids)
+            # Keep the tail of the prompt if it is too long.
+            if len(prompt_ids) > room_for_prompt:
+                # Preserve BOS at position 0 by keeping BOS + tail of body.
+                bos = [prompt_ids[0]] if prompt_ids and prompt_ids[0] == tok.bos_id else []
+                tail = prompt_ids[-(room_for_prompt - len(bos)) :] if room_for_prompt - len(bos) > 0 else []
+                prompt_ids = bos + tail
+            full_ids = prompt_ids + answer_ids
+            if len(full_ids) > block_size + 1:
+                # Final hard cap. If we have to cut, keep EOS as the last target token.
+                full_ids = full_ids[: block_size + 1]
+                if eos_id is not None and full_ids[-1] != eos_id:
+                    full_ids[-1] = eos_id
+            if len(full_ids) < 16:
+                skipped_too_short += 1
+                continue
+            # Shift by one: position i of input_ids predicts next_ids[i].
+            input_ids = full_ids[:-1]
+            next_ids = full_ids[1:]
+            # Loss only on answer tokens (including the final EOS target).
+            prompt_len = len(prompt_ids)
+            # next_ids[position] is full_ids[position + 1], so it is an answer token
+            # exactly when position + 1 >= prompt_len.
+            labels = [
+                token_id if (position + 1) >= prompt_len else -100
+                for position, token_id in enumerate(next_ids)
+            ]
+            if any(x != -100 for x in labels):
+                examples.append(Example(input_ids=input_ids, labels=labels))
+    console.print(
+        f"Loaded {len(examples):,} examples | "
+        f"empty={skipped_empty:,}, short={skipped_too_short:,}, "
+        f"truncated_answers={truncated_answers:,}, bad_json={bad_json:,}"
+    )
+    if len(examples) < 10:
+        raise RuntimeError("Too few valid SFT examples. Check your JSONL keys and tokenizer.")
+    return examples
+def make_batch(
+    examples: list[Example],
+    batch_size: int,
+    pad_id: int,
+    device: str,
+    block_size: int,
+):
+    if len(examples) >= batch_size:
+        batch = random.sample(examples, batch_size)
+    else:
+        batch = random.choices(examples, k=batch_size)
+    xs = []
+    ys = []
+    for ex in batch:
+        ix = ex.input_ids[:block_size]
+        ly = ex.labels[:block_size]
+        xs.append(torch.tensor(ix, dtype=torch.long))
+        ys.append(torch.tensor(ly, dtype=torch.long))
+    x = pad_sequence(xs, batch_first=True, padding_value=pad_id)
+    y = pad_sequence(ys, batch_first=True, padding_value=-100)
+    if device == "cuda":
+        x = x.pin_memory().to(device, non_blocking=True)
+        y = y.pin_memory().to(device, non_blocking=True)
+    else:
+        x = x.to(device)
+        y = y.to(device)
+    return x, y
+@torch.no_grad()
+def evaluate(model, examples, args, pad_id, device, autocast_ctx, block_size) -> float:
+    model.eval()
+    losses: list[float] = []
+    for _ in range(args.eval_batches):
+        x, y = make_batch(examples, args.batch_size, pad_id, device, block_size)
+        with autocast_ctx:
+            _, loss = model(x, y)
+        if torch.isfinite(loss):
+            losses.append(float(loss.item()))
+    model.train()
+    return sum(losses) / max(1, len(losses))
+def strip_compile_prefix(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    cleaned = {}
+    for key, value in state_dict.items():
+        if key.startswith("_orig_mod."):
+            key = key[len("_orig_mod.") :]
+        cleaned[key] = value
+    return cleaned
+def save_checkpoint(
+    path: Path,
+    model,
+    optimizer,
+    args: argparse.Namespace,
+    step: int,
+    best_val_loss: float,
+    meta: dict[str, Any],
+) -> None:
+    raw_model = model._orig_mod if hasattr(model, "_orig_mod") else model
+    meta = dict(meta or {})
+    meta.update(
+        {
+            "task": "paragraph_explainer_sft",
+            "tokenizer": str(args.tokenizer),
+            "sft_file": str(args.sft_file),
+            "important": "Prompt tokens are masked; answer is EOS-safe truncated.",
+        }
+    )
+    torch.save(
+        {
+            "model": raw_model.state_dict(),
+            "optimizer": optimizer.state_dict(),
+            "args": {k: (str(v) if isinstance(v, Path) else v) for k, v in vars(args).items()},
+            "config": vars(raw_model.config),
+            "step": step,
+            "best_val_loss": best_val_loss,
+            "meta": meta,
+        },
+        path,
+    )
+def main() -> None:
+    args = parse_args()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(args.seed)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device_type = "cuda" if device == "cuda" else "cpu"
+    if device == "cuda" and torch.cuda.is_bf16_supported():
+        amp_dtype = torch.bfloat16
+        console.print("AMP dtype: bfloat16")
+    elif device == "cuda":
+        amp_dtype = torch.float16
+        console.print("AMP dtype: float16")
+    else:
+        amp_dtype = torch.float32
+        console.print("AMP disabled on CPU")
+    autocast_ctx = torch.amp.autocast(
+        device_type=device_type,
+        dtype=amp_dtype,
+        enabled=(device == "cuda"),
+    )
+    tok = TextTokenizer(args.tokenizer)
+    pad_id = int(getattr(tok, "pad_id", 0))
+    if args.resume is not None:
+        ckpt_path = args.resume
+        console.print(f"Resuming SFT checkpoint: {ckpt_path}")
+    else:
+        ckpt_path = args.base_checkpoint
+        console.print(f"Starting from base checkpoint: {ckpt_path}")
+    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
+    config = GPTConfig(**ckpt["config"])
+    # Force disable dropout for stable SFT (already 0.0 in pretrain).
+    config.dropout = 0.0
+    model = GPT(config)
+    state_dict = strip_compile_prefix(ckpt["model"])
+    model.load_state_dict(state_dict, strict=True)
+    model.to(device)
+    # Sanity check: tokenizer and model vocab must match.
+    if tok.vocab_size != model.config.vocab_size:
+        raise RuntimeError(
+            f"Tokenizer vocab_size {tok.vocab_size} != model vocab_size {model.config.vocab_size}. "
+            "This is the most common cause of garbled output. Use the same tokenizer that produced the pretrain data."
+        )
+    optimizer = model.configure_optimizers(
+        args.weight_decay,
+        args.learning_rate,
+        (0.9, 0.95),
+        device_type,
+    )
+    start_step = 0
+    best_val_loss = float("inf")
+    if args.resume is not None and "optimizer" in ckpt:
+        try:
+            optimizer.load_state_dict(ckpt["optimizer"])
+            start_step = int(ckpt.get("step", 0)) + 1
+            best_val_loss = float(ckpt.get("best_val_loss", float("inf")))
+            console.print(f"Resume from step {start_step}, previous best val {best_val_loss:.4f}")
+        except Exception as exc:
+            console.print(f"[yellow]Could not load optimizer state, starting fresh: {exc}[/yellow]")
+    try:
+        scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda" and amp_dtype == torch.float16))
+    except TypeError:
+        scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda" and amp_dtype == torch.float16))
+    examples = load_examples(args.sft_file, tok, model.config.block_size, args)
+    random.shuffle(examples)
+    val_size = max(1, int(len(examples) * args.val_ratio))
+    val_examples = examples[:val_size]
+    train_examples = examples[val_size:]
+    if not train_examples:
+        raise RuntimeError("No training examples after split.")
+    console.print(
+        f"Train={len(train_examples):,} | Val={len(val_examples):,} | "
+        f"Block size={model.config.block_size} | Device={device}"
+    )
+    if args.compile:
+        console.print("Compiling model with torch.compile...")
+        model = torch.compile(model)
+    model.train()
+    block_size = model.config.block_size if not hasattr(model, "_orig_mod") else model._orig_mod.config.block_size
+    last_time = time.time()
+    last_step = start_step
+    for step in range(start_step, args.max_steps + 1):
+        lr = get_lr(step, args)
+        for group in optimizer.param_groups:
+            group["lr"] = lr
+        optimizer.zero_grad(set_to_none=True)
+        loss_accum = 0.0
+        ok_micro_steps = 0
+        for _ in range(args.grad_accum):
+            x, y = make_batch(train_examples, args.batch_size, pad_id, device, block_size)
+            with autocast_ctx:
+                _, loss = model(x, y)
+                loss = loss / args.grad_accum
+            if not torch.isfinite(loss):
+                console.print(f"[yellow]Skipping non-finite loss at step {step}[/yellow]")
+                continue
+            scaler.scale(loss).backward()
+            loss_accum += float(loss.item())
+            ok_micro_steps += 1
+        if ok_micro_steps == 0:
+            scaler.update()
+            continue
+        scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        scaler.step(optimizer)
+        scaler.update()
+        if step % args.log_interval == 0:
+            now = time.time()
+            steps_done = max(1, step - last_step)
+            console.print(
+                f"step {step:6d} | loss {loss_accum:.4f} | "
+                f"lr {lr:.2e} | {(now - last_time) / steps_done:.2f}s/step"
+            )
+            last_time = now
+            last_step = step
+        if step > 0 and (step % args.eval_interval == 0 or step == args.max_steps):
+            val_loss = evaluate(model, val_examples, args, pad_id, device, autocast_ctx, block_size)
+            console.print(f"eval step {step}: val {val_loss:.4f}")
+            if val_loss < best_val_loss:
+                best_val_loss = val_loss
+                save_checkpoint(
+                    args.out_dir / "best.pt",
+                    model,
+                    optimizer,
+                    args,
+                    step,
+                    best_val_loss,
+                    ckpt.get("meta", {}),
+                )
+                console.print(f"[green]saved best checkpoint: {best_val_loss:.4f}[/green]")
+        if step > 0 and step % args.save_interval == 0:
+            save_checkpoint(
+                args.out_dir / "latest.pt",
+                model,
+                optimizer,
+                args,
+                step,
+                best_val_loss,
+                ckpt.get("meta", {}),
+            )
+    save_checkpoint(
+        args.out_dir / "latest.pt",
+        model,
+        optimizer,
+        args,
+        args.max_steps,
+        best_val_loss,
+        ckpt.get("meta", {}),
+    )
+    console.print("Fine-tuning complete.")
+if __name__ == "__main__":
+    main()

code/make_cnndm_sft.py ADDED Viewed

	@@ -0,0 +1,130 @@

+from __future__ import annotations
+import argparse
+import json
+import random
+import re
+import unicodedata
+from pathlib import Path
+from datasets import load_dataset
+from tqdm import tqdm
+# Any run of whitespace.
+_WS = re.compile(r"\s+")
+# Control characters U+0000-U+001F; note this range includes tab and newline.
+_BAD_CHARS = re.compile(r"[\u0000-\u001f]")
+# Bracketed numeric markers such as "[12]".
+_REFS = re.compile(r"\[\s*\d+\s*\]")
+# CNN/DailyMail articles often start with "(CNN) -- " or "By . SOMEBODY . PUBLISHED:"
+_CNN_PREFIX = re.compile(r"^\s*\(CNN\)\s*--\s*", re.IGNORECASE)
+_BYLINE = re.compile(r"^\s*By\s+\.\s+.*?PUBLISHED:.*?\s*\.\s*", re.IGNORECASE | re.DOTALL)
+# Prompt wrappers; main() picks one at random per example. "{passage}" is the article slot.
+PROMPT_TEMPLATES = [
+    "Read the article and write a short summary.\n\nArticle:\n{passage}\n\nSummary:\n",
+    "Summarize the following article in a few sentences.\n\nArticle:\n{passage}\n\nShort summary:\n",
+    "Below is a news article. Give a concise summary using key facts from the article.\n\nArticle:\n{passage}\n\nSummary:\n",
+    "Provide a short summary of the article below.\n\nArticle:\n{passage}\n\nSummary:\n",
+]
+def normalize(text: str) -> str:
+    if text is None:
+        return ""
+    text = str(text)
+    text = text.replace("\ufffd", " ")
+    text = unicodedata.normalize("NFKC", text)
+    text = _BAD_CHARS.sub(" ", text)
+    text = _REFS.sub("", text)
+    text = _CNN_PREFIX.sub("", text)
+    text = _BYLINE.sub("", text)
+    text = _WS.sub(" ", text).strip()
+    return text
+def join_highlights(highlights: str) -> str:
+    """
+    CNN/DailyMail highlights come as several short lines joined by newlines.
+    We join them into a single multi-sentence string with periods.
+    """
+    if not highlights:
+        return ""
+    pieces = [p.strip() for p in highlights.split("\n") if p.strip()]
+    # Make sure each piece ends with terminal punctuation.
+    fixed = []
+    for p in pieces:
+        if p[-1] not in ".!?":
+            p = p + "."
+        fixed.append(p)
+    return " ".join(fixed)
+def is_good_pair(article: str, summary: str, min_article_chars: int, max_article_chars: int,
+                 min_summary_chars: int, max_summary_chars: int) -> bool:
+    if not article or not summary:
+        return False
+    if not (min_article_chars <= len(article) <= max_article_chars):
+        return False
+    if not (min_summary_chars <= len(summary) <= max_summary_chars):
+        return False
+    # Reject if the summary is basically the whole article (rare here but safe).
+    if len(summary) >= 0.8 * len(article):
+        return False
+    # Must be mostly letters.
+    letters = sum(ch.isalpha() for ch in article)
+    if letters / max(1, len(article)) < 0.6:
+        return False
+    return True
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Build an SFT set from CNN/DailyMail (near-extractive summaries)."
+    )
+    parser.add_argument("--out_file", type=Path, default=Path("data/wikitext103/paragraph_sft.jsonl"))
+    parser.add_argument("--dataset", type=str, default="abisee/cnn_dailymail")
+    parser.add_argument("--config", type=str, default="3.0.0")
+    parser.add_argument("--max_examples", type=int, default=100000)
+    parser.add_argument("--min_article_chars", type=int, default=400)
+    parser.add_argument("--max_article_chars", type=int, default=2200)
+    parser.add_argument("--min_summary_chars", type=int, default=80)
+    parser.add_argument("--max_summary_chars", type=int, default=400)
+    parser.add_argument("--seed", type=int, default=1337)
+    args = parser.parse_args()
+    args.out_file.parent.mkdir(parents=True, exist_ok=True)
+    rng = random.Random(args.seed)
+    print(f"Loading {args.dataset} ({args.config})...")
+    dataset = load_dataset(args.dataset, args.config, split="train")
+    count = 0
+    skipped = 0
+    with args.out_file.open("w", encoding="utf-8") as f:
+        for row in tqdm(dataset, desc="building SFT"):
+            article = normalize(row.get("article", ""))
+            summary = join_highlights(normalize(row.get("highlights", "")))
+            if not is_good_pair(
+                article, summary,
+                args.min_article_chars, args.max_article_chars,
+                args.min_summary_chars, args.max_summary_chars,
+            ):
+                skipped += 1
+                continue
+            if len(article) > args.max_article_chars:
+                article = article[: args.max_article_chars].rsplit(" ", 1)[0]
+            template = rng.choice(PROMPT_TEMPLATES)
+            prompt = template.format(passage=article)
+            f.write(json.dumps({"prompt": prompt, "answer": summary}, ensure_ascii=False) + "\n")
+            count += 1
+            if count >= args.max_examples:
+                break
+    print(f"Wrote {count:,} examples to {args.out_file} (skipped={skipped:,})")
+if __name__ == "__main__":
+    main()

code/model.py ADDED Viewed

	@@ -0,0 +1,189 @@

+from __future__ import annotations
+import inspect
+import math
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import functional as F
+@dataclass
+class GPTConfig:
+    """Hyperparameters that define the model's shape and training-time switches."""
+    # Rows of the token embedding and output width of the LM head.
+    vocab_size: int
+    # Number of learned positions; GPT.forward() rejects longer sequences.
+    block_size: int = 512
+    # Number of stacked transformer Blocks.
+    n_layer: int = 12
+    # Attention heads per block; must divide n_embd evenly.
+    n_head: int = 12
+    # Width of the embeddings / residual stream.
+    n_embd: int = 768
+    # Dropout probability used by every dropout in the model.
+    dropout: float = 0.0
+    # Whether Linear layers and LayerNorms carry a bias term.
+    bias: bool = False
+    # When True, each block is run under activation checkpointing while training.
+    gradient_checkpointing: bool = False
+class LayerNorm(nn.Module):
+    def __init__(self, ndim: int, bias: bool):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+        self.attn_dropout = config.dropout
+        self.resid_dropout = nn.Dropout(config.dropout)
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.dropout = config.dropout
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch, seq_len, channels = x.size()
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        head_dim = channels // self.n_head
+        q = q.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
+        k = k.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
+        v = v.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
+        y = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=None,
+            dropout_p=self.attn_dropout if self.training else 0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().view(batch, seq_len, channels)
+        return self.resid_dropout(self.c_proj(y))
+class MLP(nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+        self.gelu = nn.GELU(approximate="tanh")
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+        self.dropout = nn.Dropout(config.dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
+class Block(nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.mlp = MLP(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, config: GPTConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = nn.ModuleDict(
+            {
+                "wte": nn.Embedding(config.vocab_size, config.n_embd),
+                "wpe": nn.Embedding(config.block_size, config.n_embd),
+                "drop": nn.Dropout(config.dropout),
+                "h": nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+                "ln_f": LayerNorm(config.n_embd, bias=config.bias),
+            }
+        )
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.transformer.wte.weight = self.lm_head.weight
+        self.apply(self._init_weights)
+        for name, param in self.named_parameters():
+            if name.endswith("c_proj.weight"):
+                torch.nn.init.normal_(param, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
+    def _init_weights(self, module: nn.Module) -> None:
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+    def forward(
+        self, idx: torch.Tensor, targets: torch.Tensor | None = None
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        batch, seq_len = idx.size()
+        if seq_len > self.config.block_size:
+            raise ValueError(f"Sequence length {seq_len} exceeds block size {self.config.block_size}")
+        pos = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
+        x = self.transformer.drop(self.transformer.wte(idx) + self.transformer.wpe(pos))
+        for block in self.transformer.h:
+            if self.config.gradient_checkpointing and self.training:
+                x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+        x = self.transformer.ln_f(x)
+        if targets is None:
+            logits = self.lm_head(x[:, [-1], :])
+            loss = None
+        else:
+            logits = self.lm_head(x)
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
+        return logits, loss
+    @torch.no_grad()
+    def generate(
+        self,
+        idx: torch.Tensor,
+        max_new_tokens: int,
+        temperature: float = 0.8,
+        top_k: int | None = 50,
+        eos_id: int | None = None,
+    ) -> torch.Tensor:
+        for _ in range(max_new_tokens):
+            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size :]
+            logits, _ = self(idx_cond)
+            logits = logits[:, -1, :] / max(temperature, 1e-5)
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float("Inf")
+            probs = F.softmax(logits, dim=-1)
+            idx_next = torch.multinomial(probs, num_samples=1)
+            idx = torch.cat((idx, idx_next), dim=1)
+            if eos_id is not None and idx_next.item() == eos_id:
+                break
+        return idx
+    def crop_block_size(self, block_size: int) -> None:
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
+        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
+    def configure_optimizers(
+        self, weight_decay: float, learning_rate: float, betas: tuple[float, float], device_type: str
+    ) -> torch.optim.Optimizer:
+        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
+        decay_params = [p for _, p in param_dict.items() if p.dim() >= 2]
+        nodecay_params = [p for _, p in param_dict.items() if p.dim() < 2]
+        optim_groups = [
+            {"params": decay_params, "weight_decay": weight_decay},
+            {"params": nodecay_params, "weight_decay": 0.0},
+        ]
+        fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == "cuda"
+        extra_args = {"fused": True} if use_fused else {}
+        return torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+    def num_parameters(self) -> int:
+        return sum(p.numel() for p in self.parameters())

code/prepare_wikitext.py ADDED Viewed

	@@ -0,0 +1,151 @@

+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+import numpy as np
+from datasets import load_dataset
+from tokenizers import ByteLevelBPETokenizer, Tokenizer
+from tokenizers import decoders as _decoders
+from tqdm import tqdm
+SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
+def clean_lines(dataset):
+    for row in dataset:
+        text = row["text"].strip()
+        if text:
+            yield text
+class _TokenizerAdapter:
+    """
+    Small adapter so the rest of the script can call .encode(text).ids and
+    .get_vocab() / .get_vocab_size() regardless of whether the tokenizer was
+    freshly trained (ByteLevelBPETokenizer) or reloaded from JSON (Tokenizer).
+    """
+    def __init__(self, tokenizer):
+        # The wrapped tokenizer; every method below simply forwards to it.
+        self._t = tokenizer
+    def encode(self, text: str):
+        """Return the wrapped tokenizer's encoding of *text* (callers read ``.ids``)."""
+        return self._t.encode(text)
+    def get_vocab(self):
+        """Return the wrapped tokenizer's vocabulary (indexed by token string in main())."""
+        return self._t.get_vocab()
+    def get_vocab_size(self):
+        """Return the wrapped tokenizer's vocabulary size."""
+        return self._t.get_vocab_size()
+def load_or_train_tokenizer(tokenizer_path: Path, train_dataset, vocab_size: int, min_frequency: int):
+    if tokenizer_path.exists():
+        print(f"Using existing tokenizer at {tokenizer_path}")
+        # Reload via the generic Tokenizer class. ByteLevelBPETokenizer does NOT
+        # accept tokenizer_file= in current tokenizers releases.
+        t = Tokenizer.from_file(str(tokenizer_path))
+        # Make sure a ByteLevel decoder is attached so downstream decoding works.
+        try:
+            current_decoder = t.decoder
+        except Exception:
+            current_decoder = None
+        if current_decoder is None:
+            t.decoder = _decoders.ByteLevel()
+        return _TokenizerAdapter(t)
+    print("Training byte-level BPE tokenizer...")
+    t = ByteLevelBPETokenizer()
+    t.train_from_iterator(
+        clean_lines(train_dataset),
+        vocab_size=vocab_size,
+        min_frequency=min_frequency,
+        special_tokens=SPECIAL_TOKENS,
+    )
+    t.save(str(tokenizer_path))
+    # Reopen via generic Tokenizer so we attach a decoder consistently.
+    reopened = Tokenizer.from_file(str(tokenizer_path))
+    try:
+        current_decoder = reopened.decoder
+    except Exception:
+        current_decoder = None
+    if current_decoder is None:
+        reopened.decoder = _decoders.ByteLevel()
+    return _TokenizerAdapter(reopened)
+def write_split(tokenizer, dataset, out_file: Path, dtype, bos_id: int, eos_id: int) -> int:
+    token_count = 0
+    with out_file.open("wb") as f:
+        for text in tqdm(clean_lines(dataset), desc=f"tokenizing {out_file.name}"):
+            ids = [bos_id] + tokenizer.encode(text).ids + [eos_id]
+            arr = np.asarray(ids, dtype=dtype)
+            arr.tofile(f)
+            token_count += len(ids)
+    return token_count
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Download WikiText-103, train a tokenizer, and make binary token files."
+    )
+    parser.add_argument("--data_dir", type=Path, default=Path("data/wikitext103"))
+    parser.add_argument("--dataset", type=str, default="Salesforce/wikitext")
+    parser.add_argument("--config", type=str, default="wikitext-103-raw-v1")
+    parser.add_argument("--vocab_size", type=int, default=32000)
+    parser.add_argument("--min_frequency", type=int, default=2)
+    args = parser.parse_args()
+    args.data_dir.mkdir(parents=True, exist_ok=True)
+    tokenizer_path = args.data_dir / "tokenizer.json"
+    print("Loading WikiText-103...")
+    train = load_dataset(args.dataset, args.config, split="train")
+    val = load_dataset(args.dataset, args.config, split="validation")
+    test = load_dataset(args.dataset, args.config, split="test")
+    tokenizer = load_or_train_tokenizer(
+        tokenizer_path=tokenizer_path,
+        train_dataset=train,
+        vocab_size=args.vocab_size,
+        min_frequency=args.min_frequency,
+    )
+    vocab = tokenizer.get_vocab()
+    if "<bos>" not in vocab or "<eos>" not in vocab or "<pad>" not in vocab:
+        raise RuntimeError(
+            "Tokenizer is missing required special tokens (<pad>, <bos>, <eos>). "
+            "Delete data/wikitext103/tokenizer.json and re-run to retrain."
+        )
+    bos_id = vocab["<bos>"]
+    eos_id = vocab["<eos>"]
+    pad_id = vocab["<pad>"]
+    vocab_size = tokenizer.get_vocab_size()
+    dtype = np.uint16 if vocab_size <= np.iinfo(np.uint16).max else np.uint32
+    train_tokens = write_split(tokenizer, train, args.data_dir / "train.bin", dtype, bos_id, eos_id)
+    val_tokens = write_split(tokenizer, val, args.data_dir / "val.bin", dtype, bos_id, eos_id)
+    test_tokens = write_split(tokenizer, test, args.data_dir / "test.bin", dtype, bos_id, eos_id)
+    meta = {
+        "dataset": args.dataset,
+        "config": args.config,
+        "vocab_size": vocab_size,
+        "dtype": "uint16" if dtype == np.uint16 else "uint32",
+        "bos_id": bos_id,
+        "eos_id": eos_id,
+        "pad_id": pad_id,
+        "train_tokens": train_tokens,
+        "val_tokens": val_tokens,
+        "test_tokens": test_tokens,
+    }
+    (args.data_dir / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
+    print(f"Done. Wrote tokenizer and token files to {args.data_dir}")
+if __name__ == "__main__":
+    main()

code/tokenizer.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from __future__ import annotations
+from pathlib import Path
+from tokenizers import Tokenizer
+from tokenizers import decoders as _decoders
+class TextTokenizer:
+    """
+    Wrapper around tokenizers.Tokenizer that guarantees a ByteLevel decoder
+    is attached. ByteLevelBPETokenizer saves a JSON without a decoder block,
+    so reloading via Tokenizer.from_file() yields a tokenizer whose .decode()
+    returns raw byte-level tokens (Ġ, Ã¤) and replacement chars (ï¿½, �)
+    instead of proper UTF-8 text. We attach the decoder here so decode is
+    always correct.
+    """
+    def __init__(self, path: str | Path):
+        self.path = Path(path)
+        self.tokenizer = Tokenizer.from_file(str(self.path))
+        # Force a ByteLevel decoder if one is not attached.
+        try:
+            current_decoder = self.tokenizer.decoder
+        except Exception:
+            current_decoder = None
+        if current_decoder is None:
+            self.tokenizer.decoder = _decoders.ByteLevel()
+        vocab = self.tokenizer.get_vocab()
+        self.pad_id = vocab.get("<pad>", 0)
+        self.bos_id = vocab.get("<bos>", 1)
+        self.eos_id = vocab.get("<eos>", 2)
+        self.unk_id = vocab.get("<unk>", 3)
+        self.vocab_size = self.tokenizer.get_vocab_size()
+    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> list[int]:
+        ids = self.tokenizer.encode(text).ids
+        if add_bos:
+            ids = [self.bos_id] + ids
+        if add_eos:
+            ids = ids + [self.eos_id]
+        return ids
+    def decode(self, ids: list[int], skip_special_tokens: bool = True) -> str:
+        if skip_special_tokens:
+            specials = {self.pad_id, self.bos_id, self.eos_id, self.unk_id}
+            ids = [int(i) for i in ids if int(i) not in specials]
+        else:
+            ids = [int(i) for i in ids]
+        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

code/train.py ADDED Viewed

	@@ -0,0 +1,630 @@

+from __future__ import annotations
+import argparse
+import json
+import math
+import os
+import random
+import time
+from pathlib import Path
+from typing import Any
+import numpy as np
+import torch
+from rich.console import Console
+from searshorai.model import GPT, GPTConfig
+# Shared rich console used for the status output of this script.
+console = Console()
+# Named hyperparameter bundles selectable with --preset. apply_preset() copies
+# a value from the chosen bundle only when the matching CLI option is None.
+# "Tokens per step" below is batch_size * grad_accum * block_size.
+PRESETS = {
+    # 16,384 tokens per step.
+    "quick_test": dict(
+        n_layer=6,
+        n_head=6,
+        n_embd=384,
+        block_size=256,
+        batch_size=8,
+        grad_accum=8,
+        max_steps=1000,
+    ),
+    # 32,768 tokens per step; this is the default value of --preset.
+    "gpu_16gb": dict(
+        n_layer=10,
+        n_head=10,
+        n_embd=640,
+        block_size=512,
+        batch_size=4,
+        grad_accum=16,
+        max_steps=20000,
+    ),
+    # 65,536 tokens per step.
+    "rtx3090_8h": dict(
+        n_layer=12,
+        n_head=12,
+        n_embd=768,
+        block_size=512,
+        batch_size=8,
+        grad_accum=16,
+        max_steps=20000,
+    ),
+    # 49,152 tokens per step.
+    "rtx3090_quality": dict(
+        n_layer=16,
+        n_head=16,
+        n_embd=1024,
+        block_size=512,
+        batch_size=4,
+        grad_accum=24,
+        max_steps=30000,
+    ),
+    # 98,304 tokens per step.
+    "gpu_40gb_quality": dict(
+        n_layer=20,
+        n_head=16,
+        n_embd=1024,
+        block_size=768,
+        batch_size=4,
+        grad_accum=32,
+        max_steps=40000,
+    ),
+}
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Train a GPT-style language model from scratch.")
+    parser.add_argument("--data_dir", type=Path, default=Path("data/wikitext103"))
+    parser.add_argument("--out_dir", type=Path, default=Path("runs/wikitext-gpt"))
+    parser.add_argument("--preset", choices=PRESETS.keys(), default="gpu_16gb")
+    parser.add_argument("--resume", type=Path, default=None)
+    parser.add_argument("--reset_optimizer", action="store_true")
+    parser.add_argument("--reset_step", action="store_true",
+                        help="When resuming, restart step counter at 0 (useful when restarting a fresh schedule).")
+    parser.add_argument("--n_layer", type=int, default=None)
+    parser.add_argument("--n_head", type=int, default=None)
+    parser.add_argument("--n_embd", type=int, default=None)
+    parser.add_argument("--block_size", type=int, default=None)
+    parser.add_argument("--batch_size", type=int, default=None, help="Micro-batch size.")
+    parser.add_argument("--grad_accum", type=int, default=None)
+    parser.add_argument("--max_steps", type=int, default=None)
+    parser.add_argument("--learning_rate", type=float, default=2.5e-4)
+    parser.add_argument("--min_lr", type=float, default=2.5e-5)
+    parser.add_argument("--warmup_steps", type=int, default=1000)
+    parser.add_argument("--weight_decay", type=float, default=0.1)
+    parser.add_argument("--dropout", type=float, default=0.0)
+    parser.add_argument("--grad_clip", type=float, default=1.0)
+    parser.add_argument("--eval_interval", type=int, default=500)
+    parser.add_argument("--eval_iters", type=int, default=100)
+    parser.add_argument("--save_interval", type=int, default=1000)
+    parser.add_argument("--log_interval", type=int, default=20)
+    parser.add_argument("--seed", type=int, default=1337)
+    parser.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"])
+    parser.add_argument("--dtype", type=str, default="auto", choices=["auto", "float32", "float16", "bfloat16"])
+    parser.add_argument("--compile", action="store_true")
+    parser.add_argument("--gradient_checkpointing", action="store_true")
+    parser.add_argument(
+        "--no_gradient_checkpointing",
+        "--no-gradient-checkpointing",
+        action="store_true",
+        help="Disable checkpointing when resuming from a checkpoint that was trained with it.",
+    )
+    parser.add_argument("--eval_only", action="store_true")
+    parser.add_argument("--always_save_checkpoint", action="store_true")
+    parser.add_argument("--save_optimizer", action="store_true")
+    return parser.parse_args()
+def apply_preset(args: argparse.Namespace) -> argparse.Namespace:
+    preset = PRESETS[args.preset]
+    for key, value in preset.items():
+        if getattr(args, key) is None:
+            setattr(args, key, value)
+    return args
+def setup_reproducibility(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+def choose_device(args: argparse.Namespace) -> str:
+    if args.device == "auto":
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    if args.device == "cuda" and not torch.cuda.is_available():
+        raise RuntimeError("CUDA was requested, but torch.cuda.is_available() is False.")
+    return args.device
+def choose_dtype(args: argparse.Namespace, device: str) -> torch.dtype:
+    if device == "cpu":
+        return torch.float32
+    if args.dtype == "float32":
+        return torch.float32
+    if args.dtype == "float16":
+        return torch.float16
+    if args.dtype == "bfloat16":
+        if torch.cuda.is_bf16_supported():
+            return torch.bfloat16
+        console.print("[yellow]bfloat16 requested but not supported. Falling back to float16.[/yellow]")
+        return torch.float16
+    if torch.cuda.is_bf16_supported():
+        return torch.bfloat16
+    return torch.float16
+def make_autocast_context(device: str, dtype: torch.dtype):
+    enabled = device == "cuda" and dtype in (torch.float16, torch.bfloat16)
+    return torch.amp.autocast(device_type=device, dtype=dtype, enabled=enabled)
+def make_grad_scaler(device: str, dtype: torch.dtype):
+    enabled = device == "cuda" and dtype == torch.float16
+    try:
+        return torch.amp.GradScaler("cuda", enabled=enabled)
+    except TypeError:
+        return torch.cuda.amp.GradScaler(enabled=enabled)
+def get_lr(step: int, args: argparse.Namespace) -> float:
+    if step < args.warmup_steps:
+        return args.learning_rate * step / max(1, args.warmup_steps)
+    if step > args.max_steps:
+        return args.min_lr
+    decay_ratio = (step - args.warmup_steps) / max(1, args.max_steps - args.warmup_steps)
+    decay_ratio = min(1.0, max(0.0, decay_ratio))
+    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+    return args.min_lr + coeff * (args.learning_rate - args.min_lr)
+def load_json(path: Path) -> dict[str, Any]:
+    if not path.exists():
+        raise FileNotFoundError(f"Missing required file: {path}")
+    return json.loads(path.read_text(encoding="utf-8"))
+def validate_meta(meta: dict[str, Any]) -> None:
+    required_keys = ["vocab_size", "dtype"]
+    for key in required_keys:
+        if key not in meta:
+            raise KeyError(f"meta.json is missing required key: {key}")
+    if meta["dtype"] not in ("uint16", "uint32"):
+        raise ValueError(f"Unsupported meta dtype: {meta['dtype']}. Expected uint16 or uint32.")
+    if int(meta["vocab_size"]) <= 0:
+        raise ValueError("meta.json vocab_size must be greater than zero.")
+    if meta["dtype"] == "uint16" and int(meta["vocab_size"]) > 65535:
+        raise ValueError("meta dtype is uint16 but vocab_size is greater than 65535. Use uint32 data files.")
+def load_memmap(path: Path, dtype: str) -> np.memmap:
+    if not path.exists():
+        raise FileNotFoundError(f"Missing required file: {path}")
+    np_dtype = np.uint16 if dtype == "uint16" else np.uint32
+    return np.memmap(path, dtype=np_dtype, mode="r")
+def validate_dataset(train_data: np.memmap, val_data: np.memmap, block_size: int, vocab_size: int) -> None:
+    min_required = block_size + 2
+    if len(train_data) < min_required:
+        raise ValueError(
+            f"train.bin is too small. Need at least {min_required} tokens for block_size={block_size}, "
+            f"but got {len(train_data)}."
+        )
+    if len(val_data) < min_required:
+        raise ValueError(
+            f"val.bin is too small. Need at least {min_required} tokens for block_size={block_size}, "
+            f"but got {len(val_data)}."
+        )
+    sample_count = min(10000, len(train_data))
+    sample_positions = np.linspace(0, len(train_data) - 1, sample_count, dtype=np.int64)
+    sample = np.asarray(train_data[sample_positions], dtype=np.int64)
+    max_token = int(sample.max())
+    min_token = int(sample.min())
+    if min_token < 0:
+        raise ValueError(f"Dataset contains negative token id: {min_token}")
+    if max_token >= vocab_size:
+        raise ValueError(
+            f"Dataset token id {max_token} is >= vocab_size {vocab_size}. "
+            "This usually means tokenizer/meta/train.bin mismatch."
+        )
+def get_batch(
+    data: np.memmap,
+    batch_size: int,
+    block_size: int,
+    device: str,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Fast batch loader: one vectorized gather, then a single host->device transfer.
+    The old code did batch_size python-level numpy slices per call, which was a
+    major bottleneck.
+    """
+    max_start = len(data) - block_size - 1
+    if max_start <= 0:
+        raise ValueError("Dataset is too small for the configured block_size.")
+    # Random start positions.
+    ix = np.random.randint(0, max_start, size=(batch_size,), dtype=np.int64)
+    # Allocate contiguous int64 arrays. memmap reads are cheap for sequential blocks.
+    x_np = np.empty((batch_size, block_size), dtype=np.int64)
+    y_np = np.empty((batch_size, block_size), dtype=np.int64)
+    for row, start in enumerate(ix):
+        x_np[row] = data[start : start + block_size]
+        y_np[row] = data[start + 1 : start + 1 + block_size]
+    x = torch.from_numpy(x_np)
+    y = torch.from_numpy(y_np)
+    if device == "cuda":
+        x = x.pin_memory().to(device, non_blocking=True)
+        y = y.pin_memory().to(device, non_blocking=True)
+    else:
+        x = x.to(device)
+        y = y.to(device)
+    return x, y
+@torch.no_grad()
+def estimate_loss(
+    model: GPT,
+    train_data: np.memmap,
+    val_data: np.memmap,
+    args: argparse.Namespace,
+    device: str,
+    autocast_ctx,
+) -> dict[str, float]:
+    out: dict[str, float] = {}
+    model.eval()
+    for split, data in [("train", train_data), ("val", val_data)]:
+        losses = []
+        for _ in range(args.eval_iters):
+            x, y = get_batch(data, args.batch_size, args.block_size, device)
+            with autocast_ctx:
+                _, loss = model(x, y)
+            if torch.isfinite(loss):
+                losses.append(float(loss.item()))
+        out[split] = float(sum(losses) / max(1, len(losses)))
+    model.train()
+    return out
+def unwrap_model(model: GPT) -> GPT:
+    if hasattr(model, "_orig_mod"):
+        return model._orig_mod
+    return model
+def strip_compile_prefix(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    cleaned = {}
+    for key, value in state_dict.items():
+        if key.startswith("_orig_mod."):
+            key = key[len("_orig_mod.") :]
+        cleaned[key] = value
+    return cleaned
+def optimizer_to_device(optimizer: torch.optim.Optimizer, device: str) -> None:
+    for state in optimizer.state.values():
+        for key, value in state.items():
+            if isinstance(value, torch.Tensor):
+                state[key] = value.to(device)
+def save_checkpoint(
+    path: Path,
+    model: GPT,
+    optimizer: torch.optim.Optimizer | None,
+    args: argparse.Namespace,
+    step: int,
+    best_val_loss: float,
+    meta: dict[str, Any],
+) -> None:
+    raw_model = unwrap_model(model)
+    checkpoint: dict[str, Any] = {
+        "model": raw_model.state_dict(),
+        "args": vars(args),
+        "config": vars(raw_model.config),
+        "step": step,
+        "best_val_loss": best_val_loss,
+        "meta": meta,
+    }
+    if args.save_optimizer and optimizer is not None:
+        checkpoint["optimizer"] = optimizer.state_dict()
+    torch.save(checkpoint, path)
+def write_run_config(args: argparse.Namespace, meta: dict[str, Any], device: str, dtype: torch.dtype) -> None:
+    config_path = args.out_dir / "run_config.json"
+    payload = {
+        "args": {k: (str(v) if isinstance(v, Path) else v) for k, v in vars(args).items()},
+        "meta": meta,
+        "device": device,
+        "dtype": str(dtype),
+        "torch_version": torch.__version__,
+        "cuda_available": torch.cuda.is_available(),
+        "cuda_device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
+    }
+    config_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+def build_model_from_checkpoint(
+    ckpt_path: Path,
+    device: str,
+    args: argparse.Namespace,
+) -> tuple[GPT, int, float, dict[str, Any]]:
+    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
+    config = GPTConfig(**ckpt["config"])
+    if hasattr(config, "gradient_checkpointing"):
+        if args.no_gradient_checkpointing:
+            config.gradient_checkpointing = False
+        elif args.gradient_checkpointing:
+            config.gradient_checkpointing = True
+    model = GPT(config)
+    state_dict = strip_compile_prefix(ckpt["model"])
+    model.load_state_dict(state_dict, strict=True)
+    start_step = int(ckpt.get("step", 0))
+    best_val_loss = float(ckpt.get("best_val_loss", float("inf")))
+    checkpoint_meta = ckpt.get("meta", {})
+    return model, start_step, best_val_loss, checkpoint_meta
+def build_new_model(meta: dict[str, Any], args: argparse.Namespace) -> tuple[GPT, int, float]:
+    config = GPTConfig(
+        vocab_size=int(meta["vocab_size"]),
+        block_size=int(args.block_size),
+        n_layer=int(args.n_layer),
+        n_head=int(args.n_head),
+        n_embd=int(args.n_embd),
+        dropout=float(args.dropout),
+        gradient_checkpointing=bool(args.gradient_checkpointing),
+    )
+    model = GPT(config)
+    return model, 0, float("inf")
+def print_startup_info(
+    model: GPT,
+    args: argparse.Namespace,
+    device: str,
+    dtype: torch.dtype,
+    train_data: np.memmap,
+    val_data: np.memmap,
+    start_step: int,
+) -> None:
+    raw_model = unwrap_model(model)
+    tokens_per_step = args.batch_size * args.grad_accum * args.block_size
+    if hasattr(raw_model, "num_parameters"):
+        num_params = raw_model.num_parameters()
+    else:
+        num_params = sum(p.numel() for p in raw_model.parameters())
+    console.print("")
+    console.print("[bold green]Training configuration[/bold green]")
+    console.print(f"Device: {device}")
+    console.print(f"Dtype: {dtype}")
+    console.print(f"Preset: {args.preset}")
+    console.print(f"Parameters: {num_params / 1e6:.2f}M")
+    console.print(f"Layers: {args.n_layer}")
+    console.print(f"Heads: {args.n_head}")
+    console.print(f"Embedding size: {args.n_embd}")
+    console.print(f"Block size: {args.block_size}")
+    console.print(f"Batch size: {args.batch_size}")
+    console.print(f"Grad accumulation: {args.grad_accum}")
+    console.print(f"Tokens per step: {tokens_per_step:,}")
+    console.print(f"Train tokens: {len(train_data):,}")
+    console.print(f"Val tokens: {len(val_data):,}")
+    console.print(f"Start step: {start_step:,}")
+    console.print(f"Max steps: {args.max_steps:,}")
+    console.print(f"Learning rate: {args.learning_rate:.2e}")
+    console.print(f"Min LR: {args.min_lr:.2e}")
+    console.print(f"Warmup steps: {args.warmup_steps:,}")
+    console.print(f"Grad clip: {args.grad_clip}")
+    console.print("")
+def main() -> None:
+    args = apply_preset(parse_args())
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    setup_reproducibility(args.seed)
+    device = choose_device(args)
+    dtype = choose_dtype(args, device)
+    autocast_ctx = make_autocast_context(device, dtype)
+    scaler = make_grad_scaler(device, dtype)
+    meta_path = args.data_dir / "meta.json"
+    meta = load_json(meta_path)
+    validate_meta(meta)
+    train_data = load_memmap(args.data_dir / "train.bin", meta["dtype"])
+    val_data = load_memmap(args.data_dir / "val.bin", meta["dtype"])
+    validate_dataset(
+        train_data=train_data,
+        val_data=val_data,
+        block_size=int(args.block_size),
+        vocab_size=int(meta["vocab_size"]),
+    )
+    if args.resume is not None:
+        console.print(f"[yellow]Resuming from checkpoint:[/yellow] {args.resume}")
+        model, start_step, best_val_loss, checkpoint_meta = build_model_from_checkpoint(args.resume, device, args)
+        if checkpoint_meta:
+            meta = checkpoint_meta
+    else:
+        model, start_step, best_val_loss = build_new_model(meta, args)
+    if args.reset_step:
+        start_step = 0
+        best_val_loss = float("inf")
+        console.print("[yellow]reset_step set: step counter restarted at 0.[/yellow]")
+    model.to(device)
+    optimizer = model.configure_optimizers(
+        args.weight_decay,
+        args.learning_rate,
+        (0.9, 0.95),
+        "cuda" if device == "cuda" else "cpu",
+    )
+    if args.resume is not None and not args.reset_optimizer:
+        ckpt = torch.load(args.resume, map_location=device, weights_only=False)
+        if "optimizer" in ckpt:
+            try:
+                optimizer.load_state_dict(ckpt["optimizer"])
+                optimizer_to_device(optimizer, device)
+                console.print("[green]Loaded optimizer state from checkpoint.[/green]")
+            except Exception as exc:
+                console.print(f"[yellow]Could not load optimizer state. Continuing with fresh optimizer. Error: {exc}[/yellow]")
+        else:
+            console.print("[yellow]Checkpoint has no optimizer state. Continuing with fresh optimizer.[/yellow]")
+    elif args.resume is not None and args.reset_optimizer:
+        console.print("[yellow]reset_optimizer set: starting with fresh Adam moments.[/yellow]")
+    if args.compile:
+        console.print("[cyan]Compiling model...[/cyan]")
+        model = torch.compile(model)
+    write_run_config(args, meta, device, dtype)
+    print_startup_info(model, args, device, dtype, train_data, val_data, start_step)
+    if args.eval_only:
+        losses = estimate_loss(model, train_data, val_data, args, device, autocast_ctx)
+        console.print(f"eval only: train {losses['train']:.4f}, val {losses['val']:.4f}")
+        return
+    model.train()
+    tokens_per_step = args.batch_size * args.grad_accum * args.block_size
+    start_time = time.time()
+    last_log_time = start_time
+    last_log_step = start_step
+    for completed_step in range(start_step, args.max_steps):
+        step = completed_step + 1
+        lr = get_lr(step, args)
+        for param_group in optimizer.param_groups:
+            param_group["lr"] = lr
+        optimizer.zero_grad(set_to_none=True)
+        loss_accum = 0.0
+        skipped_micro = 0
+        for _ in range(args.grad_accum):
+            x, y = get_batch(train_data, args.batch_size, args.block_size, device)
+            with autocast_ctx:
+                _, loss = model(x, y)
+                loss = loss / args.grad_accum
+            if not torch.isfinite(loss):
+                console.print(f"[yellow]Non-finite loss at step {step}, skipping micro-batch.[/yellow]")
+                skipped_micro += 1
+                continue
+            scaler.scale(loss).backward()
+            loss_accum += float(loss.item())
+        if skipped_micro == args.grad_accum:
+            # Whole step was bad. Skip the optimizer update.
+            scaler.update()
+            continue
+        scaler.unscale_(optimizer)
+        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        scaler.step(optimizer)
+        scaler.update()
+        if step % args.log_interval == 0 or step == start_step + 1:
+            now = time.time()
+            elapsed = max(now - last_log_time, 1e-9)
+            steps_done = max(1, step - last_log_step)
+            toks_per_sec = (tokens_per_step * steps_done) / elapsed
+            last_log_time = now
+            last_log_step = step
+            console.print(
+                f"step {step:7d} | "
+                f"loss {loss_accum:.4f} | "
+                f"lr {lr:.2e} | "
+                f"grad {float(grad_norm):.2f} | "
+                f"{toks_per_sec:,.0f} tok/s"
+            )
+        should_eval = step % args.eval_interval == 0 or step == args.max_steps
+        if should_eval:
+            losses = estimate_loss(model, train_data, val_data, args, device, autocast_ctx)
+            console.print(
+                f"[bold]eval step {step}:[/bold] "
+                f"train {losses['train']:.4f}, val {losses['val']:.4f}"
+            )
+            if losses["val"] < best_val_loss:
+                best_val_loss = losses["val"]
+                save_checkpoint(
+                    args.out_dir / "best.pt",
+                    model,
+                    optimizer,
+                    args,
+                    step,
+                    best_val_loss,
+                    meta,
+                )
+                console.print(f"[green]saved best checkpoint: val {best_val_loss:.4f}[/green]")
+            if args.always_save_checkpoint:
+                save_checkpoint(
+                    args.out_dir / f"step_{step}.pt",
+                    model,
+                    optimizer,
+                    args,
+                    step,
+                    best_val_loss,
+                    meta,
+                )
+        if step % args.save_interval == 0:
+            save_checkpoint(
+                args.out_dir / "latest.pt",
+                model,
+                optimizer,
+                args,
+                step,
+                best_val_loss,
+                meta,
+            )
+            console.print(f"[cyan]saved latest checkpoint at step {step}[/cyan]")
+    save_checkpoint(
+        args.out_dir / "latest.pt",
+        model,
+        optimizer,
+        args,
+        args.max_steps,
+        best_val_loss,
+        meta,
+    )
+    elapsed_hours = (time.time() - start_time) / 3600.0
+    console.print("")
+    console.print(f"[bold green]Finished in {elapsed_hours:.2f} hours.[/bold green]")
+    console.print(f"[bold green]Best validation loss: {best_val_loss:.4f}[/bold green]")
+    console.print(f"[bold green]Best checkpoint: {args.out_dir / 'best.pt'}[/bold green]")
+    console.print(f"[bold green]Latest checkpoint: {args.out_dir / 'latest.pt'}[/bold green]")
+# Script entry point: start training only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "vocab_size": 32000,
+  "block_size": 512,
+  "n_layer": 12,
+  "n_head": 12,
+  "n_embd": 768,
+  "dropout": 0.0,
+  "bias": false,
+  "gradient_checkpointing": false,
+  "model_type": "ron-gpt",
+  "architectures": [
+    "GPT"
+  ],
+  "torch_dtype": "float32"
+}

meta.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "dataset": "Salesforce/wikitext",
+  "config": "wikitext-103-raw-v1",
+  "vocab_size": 32000,
+  "dtype": "uint16",
+  "bos_id": 1,
+  "eos_id": 2,
+  "pad_id": 0,
+  "train_tokens": 115671965,
+  "val_tokens": 242485,
+  "test_tokens": 276246
+}

pretrain.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2b0ce466438490a11b38092ed30993111987b58f6d8e08da64c262db1e0f476
+size 1319159633

summarizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b665451d9bdb04205cafd88b0ef46a777204584cef3a037c3bd47f0598631e8
+size 1319159633

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff