File size: 1,000 Bytes
7fc99b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from pathlib import Path

import torch

from .tokenizer import CharTokenizer


def load_corpus(path: Path) -> str:
    """Read the training corpus at *path* as UTF-8 and strip surrounding whitespace.

    Raises:
        ValueError: if the file contains nothing but whitespace.
    """
    contents = path.read_text(encoding="utf-8").strip()
    if contents:
        return contents
    raise ValueError(f"Corpus file is empty: {path}")


def build_tokenizer(corpus_path: Path) -> tuple[str, CharTokenizer]:
    """Load the corpus at *corpus_path* and fit a character tokenizer over it.

    Returns:
        The corpus text together with a ``CharTokenizer`` built from that text.
    """
    corpus_text = load_corpus(corpus_path)
    tokenizer = CharTokenizer(corpus_text)
    return corpus_text, tokenizer


def build_training_tensors(
    text: str,
    tokenizer: "CharTokenizer",
    block_size: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Build next-token prediction pairs from *text*.

    Each window of ``block_size`` consecutive tokens becomes one input row,
    and the same window shifted right by one token becomes its target row.

    Args:
        text: Raw corpus text to tokenize.
        tokenizer: Tokenizer exposing ``encode(str)`` returning a sequence of
            integer token ids.
        block_size: Context length of each training example.

    Returns:
        ``(x, y)`` — two ``torch.long`` tensors, each of shape
        ``(len(tokens) - block_size, block_size)``.

    Raises:
        ValueError: if the encoded corpus has fewer than ``block_size + 1``
            tokens (no complete input/target pair fits).
    """
    tokens = tokenizer.encode(text)
    if len(tokens) <= block_size:
        raise ValueError("Corpus is too small for the configured block size.")

    # Vectorized windowing: a single tensor + unfold replaces the Python-level
    # append loop, which materialized ~len(tokens) lists of length block_size
    # and then paid torch.tensor's slow list-of-lists conversion.
    data = torch.tensor(tokens, dtype=torch.long)
    # unfold yields all len(tokens) - block_size + 1 sliding windows of width
    # block_size (step 1); inputs drop the last window, targets drop the
    # first, giving the one-token shift. contiguous() copies each out of the
    # overlapping view so x and y do not share storage.
    windows = data.unfold(0, block_size, 1)
    x = windows[:-1].contiguous()
    y = windows[1:].contiguous()
    return x, y