File size: 1,568 Bytes
3270dae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Dataset implementations and loaders."""

# The HuggingFace-backed dataset classes are optional: JSONL-only deployments
# do not need them. If importing any of them raises ImportError, all four
# names are bound to None so that the package itself still imports and every
# name listed in __all__ still exists.
try:
    from .hf_base import BaseHFDataset
    from .hf_pretrain import PretrainDataset
    from .hf_sft import SFTDataset
    from .hf_rl import RLDataset
except ImportError:
    # Best-effort fallback: consumers must check for None before using these.
    BaseHFDataset = PretrainDataset = SFTDataset = RLDataset = None

# JSONL-based datasets (async-only)
from .jsonl_base import BaseJSONLDataset
from .pretrain_jsonl import PretrainJSONLDataset
from .sft_jsonl import SFTJSONLDataset
from .rl_jsonl import RLJSONLDataset

# Utilities
from .tokenizer import SentencePieceTokenizerWrapper
from .sft_utils import (
    parse_sft_record,
    build_sft_sequence_tokens,
    apply_response_masking,
    build_response_only_next_token_labels,
)
from .loaders import get_dataloader
from .async_loader import AsyncBatchIterator
from .tokenization_queue import TokenizationQueue
from .factory import DatasetFactory

# Public API of this package: the names exported by a star-import. Kept as a
# plain list literal so static tools can read it.
__all__ = [
    # HuggingFace datasets (each is None when the optional HF imports failed)
    "BaseHFDataset",
    "PretrainDataset",
    "SFTDataset", 
    "RLDataset",
    # JSONL datasets
    "BaseJSONLDataset",
    "PretrainJSONLDataset",
    "SFTJSONLDataset",
    "RLJSONLDataset",
    # Tokenizer wrapper and SFT record/label helpers
    "SentencePieceTokenizerWrapper",
    "parse_sft_record",
    "build_sft_sequence_tokens",
    "apply_response_masking",
    "build_response_only_next_token_labels",
    # Data loading
    "get_dataloader",
    "AsyncBatchIterator",
    "TokenizationQueue",
    "DatasetFactory",
]