"""Tiny Shakespeare dataset loader with character-level tokenization.

Standard benchmark for small LMs (Karpathy / nanoGPT).
~1MB of text, character-level vocab (~65 unique chars).
"""
import torch
import os
class CharTokenizer:
"""Simple character-level tokenizer."""
def __init__(self, text: str):
chars = sorted(set(text))
self.vocab_size = len(chars)
self.stoi = {c: i for i, c in enumerate(chars)}
self.itos = {i: c for c, i in self.stoi.items()}
def encode(self, text: str) -> list[int]:
return [self.stoi[c] for c in text]
def decode(self, ids: list[int] | torch.Tensor) -> str:
if isinstance(ids, torch.Tensor):
ids = ids.tolist()
return "".join(self.itos[i] for i in ids)
class ShakespeareDataset:
    """Tiny Shakespeare — character-level language modeling dataset.

    The file at ``path`` is read in full and encoded with a ``CharTokenizer``
    built from the complete text, so the 'train' and 'val' splits of the same
    file share one vocabulary. The encoded tensor is not moved to ``device``
    at construction; only the batches returned by ``get_batch`` are.

    Args:
        path: path to tiny_shakespeare.txt
        split: 'train' (first 90% of tokens) or 'val' (remaining 10%)
        seq_len: context window length
        device: torch device that sampled batches are moved to

    Raises:
        ValueError: if ``split`` is neither 'train' nor 'val'.
    """

    def __init__(
        self,
        path: str,
        split: str = "train",
        seq_len: int = 256,
        device: str = "cpu",
    ):
        # Explicit raise rather than `assert`, which is stripped under `-O`.
        if split not in ("train", "val"):
            raise ValueError(f"split must be 'train' or 'val', got {split!r}")
        self.seq_len = seq_len
        self.device = device

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        self.tokenizer = CharTokenizer(text)
        data = torch.tensor(self.tokenizer.encode(text), dtype=torch.long)

        # 90/10 train/val split
        n = int(0.9 * len(data))
        self.data = data[:n] if split == "train" else data[n:]

    @property
    def vocab_size(self) -> int:
        """Number of distinct characters known to the tokenizer."""
        return self.tokenizer.vocab_size

    def get_batch(self, batch_size: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Sample random batch of (input, target) sequences.

        Args:
            batch_size: number of sequences (B) to sample, with replacement.

        Returns:
            x: [B, T] input token ids
            y: [B, T] target token ids (shifted by 1)

        Raises:
            ValueError: if the split holds fewer than ``seq_len + 1`` tokens,
                i.e. not even one (input, target) window fits.
        """
        # A window starting at s reads data[s : s + seq_len + 1], so valid
        # starts are 0 .. len(data) - seq_len - 1: len(data) - seq_len of them.
        num_starts = len(self.data) - self.seq_len
        if num_starts < 1:
            raise ValueError(
                f"split has {len(self.data)} tokens; need at least "
                f"{self.seq_len + 1} for seq_len={self.seq_len}"
            )
        # randint's upper bound is exclusive, so this covers every valid start.
        starts = torch.randint(0, num_starts, (batch_size,))
        # [B, 1] + [T] broadcasts to a [B, T] grid of positions; one gather
        # replaces a Python-level slice per sample.
        positions = starts.unsqueeze(1) + torch.arange(self.seq_len)
        x = self.data[positions]
        y = self.data[positions + 1]
        return x.to(self.device), y.to(self.device)

    def __len__(self) -> int:
        """Number of tokens in this split."""
        return len(self.data)
def load_shakespeare(
    seq_len: int = 256,
    device: str = "cpu",
    data_dir: str = "data_cache",
) -> tuple["ShakespeareDataset", "ShakespeareDataset"]:
    """Load train and val splits of Tiny Shakespeare.

    Args:
        seq_len: context window length passed to both datasets.
        device: torch device passed to both datasets.
        data_dir: directory expected to contain ``tiny_shakespeare.txt``.

    Returns:
        (train_dataset, val_dataset)

    Raises:
        FileNotFoundError: if ``tiny_shakespeare.txt`` is not in ``data_dir``.
    """
    path = os.path.join(data_dir, "tiny_shakespeare.txt")
    if not os.path.exists(path):
        # Name the directory actually searched, not the default, so the
        # advice stays correct when the caller overrides data_dir.
        raise FileNotFoundError(
            f"tiny_shakespeare.txt not found at {path}. "
            f"Place the file in the {data_dir!r} directory."
        )
    train = ShakespeareDataset(path, split="train", seq_len=seq_len, device=device)
    val = ShakespeareDataset(path, split="val", seq_len=seq_len, device=device)
    return train, val
|