from pathlib import Path

import torch

from .tokenizer import CharTokenizer


def load_corpus(path: Path) -> str:
    """Read a UTF-8 corpus from disk, rejecting empty or whitespace-only files."""
    text = path.read_text(encoding="utf-8").strip()
    if not text:
        raise ValueError(f"Corpus file is empty: {path}")
    return text


def build_tokenizer(corpus_path: Path) -> tuple[str, CharTokenizer]:
    """Load the corpus and fit a character-level tokenizer on its text."""
    text = load_corpus(corpus_path)
    return text, CharTokenizer(text)


def build_training_tensors(
    text: str,
    tokenizer: CharTokenizer,
    block_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Slice the encoded corpus into overlapping next-token training pairs.

    For tokens [t0, t1, t2, t3] and block_size=2, inputs are
    [[t0, t1], [t1, t2]] and targets are [[t1, t2], [t2, t3]]:
    each target row is its input row shifted one token to the right.
    """
    tokens = tokenizer.encode(text)
    if len(tokens) <= block_size:
        raise ValueError("Corpus is too small for the configured block size.")

    inputs = []
    targets = []
    for idx in range(len(tokens) - block_size):
        # Take block_size + 1 tokens so each window yields an aligned
        # input/target pair after the one-token shift below.
        chunk = tokens[idx : idx + block_size + 1]
        inputs.append(chunk[:-1])
        targets.append(chunk[1:])

    x = torch.tensor(inputs, dtype=torch.long)
    y = torch.tensor(targets, dtype=torch.long)
    return x, y
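

# Hedged smoke-test sketch, not part of the module's API: the corpus path and
# package name below are assumptions, not taken from this repo. Because of the
# relative import above, run it as a module from the project root, e.g.
# `python -m yourpkg.data`.
if __name__ == "__main__":
    demo_text, demo_tokenizer = build_tokenizer(Path("data/corpus.txt"))
    x, y = build_training_tensors(demo_text, demo_tokenizer, block_size=8)
    # Both tensors have shape (num_windows, block_size); each target row is
    # its input row shifted one token to the right.
    print(x.shape, y.shape)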