"""Real data loading for WrinkleBrane training.

Byte-level tokenization — each byte is a token (vocab_size=259):

    0       = PAD
    1       = BOS
    2       = EOS
    3..258  = byte 0x00..0xFF

Data files are loaded, split into documents, framed with BOS/EOS, concatenated
into one flat token stream, and served as random-offset chunks for next-token
prediction.
"""

from __future__ import annotations

import os
import random
from pathlib import Path
from typing import List, Tuple, Optional

import torch
from torch import Tensor

# Special tokens
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
BYTE_OFFSET = 3
VOCAB_SIZE = 259  # 3 special + 256 bytes


def encode_bytes(text: str) -> List[int]:
    """Encode a string to byte-level token IDs.

    The text is UTF-8 encoded and every byte value is shifted up by
    ``BYTE_OFFSET`` so that it cannot collide with the special tokens.
    """
    return [b + BYTE_OFFSET for b in text.encode("utf-8", errors="replace")]


def decode_tokens(ids: List[int]) -> str:
    """Decode token IDs back to a string.

    Special tokens (IDs below ``BYTE_OFFSET``) are dropped from the output.
    Byte sequences that are not valid UTF-8 decode to U+FFFD.

    Raises
    ------
    ValueError
        If an ID is ``>= VOCAB_SIZE`` (it does not map to a byte value).
    """
    raw = bytes(i - BYTE_OFFSET for i in ids if i >= BYTE_OFFSET)
    return raw.decode("utf-8", errors="replace")


class ByteCorpus:
    """Holds a tokenised corpus in a flat 1-D tensor of token IDs.

    Corpora built with :meth:`from_files` store ``torch.long`` IDs and wrap
    each document with BOS/EOS markers. Random chunks can be drawn for
    training with :meth:`get_batch`.
    """

    def __init__(self, token_ids: Tensor):
        """
        Parameters
        ----------
        token_ids : Tensor [N]
            Flat 1-D tensor of all token IDs.
        """
        self.data = token_ids
        self.length = len(token_ids)

    @classmethod
    def from_files(cls, paths: List[str], shuffle_docs: bool = True) -> "ByteCorpus":
        """Load and tokenise multiple text files.

        Documents within each file are split on ``<|endoftext|>`` markers if
        present, otherwise the whole file is one document. Documents are
        stripped of surrounding whitespace, and documents that are empty
        after stripping are skipped.

        Parameters
        ----------
        paths : list of str
            Text files to read (decoded as UTF-8, invalid bytes replaced).
        shuffle_docs : bool
            Shuffle the document order (using the global ``random`` state)
            before the documents are concatenated.
        """
        separator = "<|endoftext|>"

        documents = []
        for path in paths:
            text = Path(path).read_text(encoding="utf-8", errors="replace")
            # ``split`` returns the whole text as a single part when the
            # marker is absent, so one code path covers both file layouts.
            for part in text.split(separator):
                part = part.strip()
                if part:
                    documents.append(part)

        if shuffle_docs:
            random.shuffle(documents)

        # Encode all documents with BOS/EOS framing
        all_ids = []
        for doc in documents:
            all_ids.append(BOS_ID)
            all_ids.extend(encode_bytes(doc))
            all_ids.append(EOS_ID)

        token_ids = torch.tensor(all_ids, dtype=torch.long)
        # Report the real in-memory footprint of the tensor instead of
        # assuming a fixed number of bytes per token.
        size_mb = token_ids.numel() * token_ids.element_size() / 1e6
        print(f" Corpus: {len(documents)} documents, "
              f"{len(token_ids):,} tokens ({size_mb:.1f}MB)")
        return cls(token_ids)

    def get_batch(self, batch_size: int, seq_len: int) -> Tuple[Tensor, Tensor]:
        """Sample random chunks for next-token prediction.

        Returns
        -------
        input_ids : Tensor [B, seq_len]
        target_ids : Tensor [B, seq_len]
            Shifted by one position.

        Raises
        ------
        ValueError
            If the corpus holds fewer than ``seq_len + 1`` tokens.
        """
        # Largest start offset whose target window still fits in the corpus.
        max_start = self.length - seq_len - 1
        if max_start < 0:
            raise ValueError(
                f"corpus has {self.length} tokens but a chunk needs at least "
                f"{seq_len + 1} (seq_len + 1)"
            )

        # ``high`` is exclusive in torch.randint, so add 1 to make the final
        # valid offset reachable.
        starts = torch.randint(0, max_start + 1, (batch_size,)).tolist()
        input_ids = torch.stack([self.data[s:s + seq_len] for s in starts])
        target_ids = torch.stack([self.data[s + 1:s + 1 + seq_len] for s in starts])
        return input_ids, target_ids


def load_train_val(
    data_dir: str,
    shuffle: bool = True,
) -> Tuple[ByteCorpus, ByteCorpus]:
    """Load train and validation corpora from the raw data directory.

    Training data: all files except tinystories_valid.txt
    Validation data: tinystories_valid.txt

    Files from the expected set that do not exist are silently skipped, so
    either corpus may end up empty.

    Parameters
    ----------
    data_dir : str
        Path to the raw data directory.
    shuffle : bool
        Whether to shuffle documents within train.

    Returns
    -------
    train_corpus : ByteCorpus
    val_corpus : ByteCorpus
        Validation documents are never shuffled.
    """
    root = Path(data_dir)
    train_names = [
        "tinystories_train.txt",
        "math_data.txt",
        "logic_reasoning.txt",
        "code_snippets.txt",
        "ascii_tables.txt",
        "byte_tables.txt",
    ]
    val_names = [
        "tinystories_valid.txt",
    ]

    # Only include files that exist
    train_files = [str(root / name) for name in train_names if (root / name).exists()]
    val_files = [str(root / name) for name in val_names if (root / name).exists()]

    print("Loading training data...")
    train_corpus = ByteCorpus.from_files(train_files, shuffle_docs=shuffle)
    print("Loading validation data...")
    val_corpus = ByteCorpus.from_files(val_files, shuffle_docs=False)

    return train_corpus, val_corpus