File size: 635 Bytes
27871e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Data loading and tokenizer components
#
# Package façade: re-exports the public names from the sibling modules
# (.tokenizer, .dataset, .dataloader) so callers can import them directly
# from this package.

# Tokenizer re-export (see .tokenizer for the implementation).
from .tokenizer import SLMTokenizer
# Dataset classes plus JSONL and train/val-split helpers (see .dataset).
from .dataset import (
    ConversationalDataset,
    StreamingTextDataset,
    PackedDataset,
    create_train_val_split,
    load_jsonl,
    save_jsonl,
)
# DataModule wrappers and dataloader/token-estimation helpers (see .dataloader).
from .dataloader import (
    DataModule,
    StreamingDataModule,
    create_dataloader,
    estimate_dataset_tokens,
)

# Explicit public API: exactly the names imported above, in the same order.
__all__ = [
    "SLMTokenizer",
    "ConversationalDataset",
    "StreamingTextDataset",
    "PackedDataset",
    "create_train_val_split",
    "load_jsonl",
    "save_jsonl",
    "DataModule",
    "StreamingDataModule",
    "create_dataloader",
    "estimate_dataset_tokens",
]