# llm/data.py — provenance: uploaded by abersbail via "Upload 16 files" (commit 7fc99b0, verified)
from pathlib import Path
import torch
from .tokenizer import CharTokenizer
def load_corpus(path: Path) -> str:
    """Read a UTF-8 text corpus from *path* and strip surrounding whitespace.

    Raises:
        ValueError: if the file contains nothing but whitespace.
    """
    stripped = path.read_text(encoding="utf-8").strip()
    if stripped:
        return stripped
    raise ValueError(f"Corpus file is empty: {path}")
def build_tokenizer(corpus_path: Path) -> tuple[str, CharTokenizer]:
    """Load the corpus at *corpus_path* and fit a character tokenizer on it.

    Returns the corpus text alongside the tokenizer so callers can encode
    the exact text the vocabulary was built from.
    """
    corpus_text = load_corpus(corpus_path)
    tokenizer = CharTokenizer(corpus_text)
    return corpus_text, tokenizer
def build_training_tensors(
    text: str,
    tokenizer: CharTokenizer,
    block_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Build next-token-prediction training pairs from *text*.

    Every length-``block_size`` window of the encoded corpus becomes one
    input row; the target row is the same window shifted right by one token.

    Args:
        text: Raw corpus text to encode.
        tokenizer: Tokenizer whose ``encode`` maps text to a token-id list.
        block_size: Context length of each training example; must be positive.

    Returns:
        ``(x, y)`` long tensors, each of shape
        ``(len(tokens) - block_size, block_size)``.

    Raises:
        ValueError: if ``block_size`` is not positive, or the encoded corpus
            has no more than ``block_size`` tokens.
    """
    tokens = tokenizer.encode(text)
    if block_size <= 0:
        raise ValueError("block_size must be positive.")
    if len(tokens) <= block_size:
        raise ValueError("Corpus is too small for the configured block size.")
    token_tensor = torch.tensor(tokens, dtype=torch.long)
    # One vectorized unfold replaces the per-window Python loop the naive
    # version would use: each row is a window of block_size + 1 tokens, so
    # input/target pairs are just column slices (copied via .contiguous()
    # because unfold returns overlapping views of the same storage).
    windows = token_tensor.unfold(0, block_size + 1, 1)
    x = windows[:, :-1].contiguous()
    y = windows[:, 1:].contiguous()
    return x, y