from pathlib import Path
import torch
from .tokenizer import CharTokenizer
def load_corpus(path: Path) -> str:
    """Read *path* as UTF-8 and return its contents with surrounding whitespace removed.

    Raises:
        ValueError: if the file contains no non-whitespace text.
    """
    corpus = path.read_text(encoding="utf-8").strip()
    if corpus:
        return corpus
    raise ValueError(f"Corpus file is empty: {path}")
def build_tokenizer(corpus_path: Path) -> tuple[str, CharTokenizer]:
    """Load the corpus at *corpus_path* and build a character tokenizer over it.

    Returns:
        The loaded corpus text and a ``CharTokenizer`` fitted to that text.
    """
    corpus = load_corpus(corpus_path)
    tokenizer = CharTokenizer(corpus)
    return corpus, tokenizer
def build_training_tensors(
text: str,
tokenizer: CharTokenizer,
block_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
tokens = tokenizer.encode(text)
if len(tokens) <= block_size:
raise ValueError("Corpus is too small for the configured block size.")
inputs = []
targets = []
for idx in range(len(tokens) - block_size):
chunk = tokens[idx : idx + block_size + 1]
inputs.append(chunk[:-1])
targets.append(chunk[1:])
x = torch.tensor(inputs, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)
return x, y