from pathlib import Path

import torch

from .tokenizer import CharTokenizer


def load_corpus(path: Path) -> str:
    """Read the training corpus, rejecting empty files early."""
    text = path.read_text(encoding="utf-8").strip()
    if not text:
        raise ValueError(f"Corpus file is empty: {path}")
    return text


def build_tokenizer(corpus_path: Path) -> tuple[str, CharTokenizer]:
    """Load the corpus and build a character-level tokenizer over its text."""
    text = load_corpus(corpus_path)
    return text, CharTokenizer(text)


def build_training_tensors(
    text: str,
    tokenizer: CharTokenizer,
    block_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Slice the encoded corpus into overlapping (input, target) windows.

    Each window of ``block_size + 1`` tokens yields one training pair: the
    first ``block_size`` tokens as the input and the same tokens shifted one
    position to the right as the next-token targets.
    """
    tokens = tokenizer.encode(text)
    if len(tokens) <= block_size:
        raise ValueError("Corpus is too small for the configured block size.")
    inputs = []
    targets = []
    for idx in range(len(tokens) - block_size):
        chunk = tokens[idx : idx + block_size + 1]
        inputs.append(chunk[:-1])
        targets.append(chunk[1:])
    x = torch.tensor(inputs, dtype=torch.long)
    y = torch.tensor(targets, dtype=torch.long)
    return x, y
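

# Usage sketch (illustrative only, not part of the module's public surface):
# shows how the three helpers compose into training tensors. The corpus path
# and block size below are hypothetical placeholders; because of the relative
# import above, run this with `python -m <package>.<module>` from the package
# root rather than as a standalone script.
if __name__ == "__main__":
    corpus_path = Path("data/corpus.txt")  # hypothetical corpus location
    text, tokenizer = build_tokenizer(corpus_path)
    x, y = build_training_tensors(text, tokenizer, block_size=64)
    # Each row of `x` is a window of 64 token ids; the matching row of `y`
    # is that window shifted one position right, so position i of `y` is the
    # training target for positions 0..i of `x`.
    print(x.shape, y.shape)  # both (len(tokens) - 64, 64)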