""" Mmap Data Loader ================ Memory-mapped binary dataset for efficient training. Loads tokenized .bin files created by tokenize_data.py. Uses mmap for low RAM overhead — OS manages page cache. """ import os import struct import numpy as np import torch import torch.distributed as dist from torch.utils.data import Dataset, DataLoader, DistributedSampler def _is_rank0() -> bool: return not dist.is_initialized() or dist.get_rank() == 0 HEADER_MAGIC = b"FREQTOK1" HEADER_SIZE = 32 def read_vocab_size(path: str) -> int: """Read vocab_size from binary data file header without loading the data.""" with open(path, "rb") as f: magic = f.read(8) assert magic == HEADER_MAGIC, f"Invalid file format: {path}" _version = struct.unpack(" 0, prefetch_factor=4 if num_workers > 0 else None, ) train_sampler = DistributedSampler(train_ds, shuffle=True) if distributed else None train_loader = DataLoader( train_ds, batch_size=batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=num_workers, pin_memory=True, drop_last=True, persistent_workers=num_workers > 0, prefetch_factor=4 if num_workers > 0 else None, ) return train_loader, val_loader