StarMist0012's picture
Add files using upload-large-folder tool
3270dae verified
"""Dataset implementations and loaders."""
# HuggingFace-based datasets are optional for JSONL-only deployments.
try:
    # Grouped in one guard: if any of these imports raises ImportError,
    # the whole HuggingFace-backed family is treated as unavailable.
    from .hf_base import BaseHFDataset
    from .hf_pretrain import PretrainDataset
    from .hf_sft import SFTDataset
    from .hf_rl import RLDataset
except ImportError:
    # Keep the public names bound so the entries in ``__all__`` and
    # ``from <package> import SFTDataset`` still resolve. The value is
    # ``None`` in that case; callers presumably need to check for it
    # before use (confirm against the factory/loader code).
    BaseHFDataset = None
    PretrainDataset = None
    SFTDataset = None
    RLDataset = None
# JSONL-based datasets (async-only)
from .jsonl_base import BaseJSONLDataset
from .pretrain_jsonl import PretrainJSONLDataset
from .sft_jsonl import SFTJSONLDataset
from .rl_jsonl import RLJSONLDataset
# Utilities
from .tokenizer import SentencePieceTokenizerWrapper
from .sft_utils import (
parse_sft_record,
build_sft_sequence_tokens,
apply_response_masking,
build_response_only_next_token_labels,
)
from .loaders import get_dataloader
from .async_loader import AsyncBatchIterator
from .tokenization_queue import TokenizationQueue
from .factory import DatasetFactory
# Names re-exported by this package, listed in the same groups as the
# imports above.
__all__ = [
    # HuggingFace-backed datasets (bound to None when their import failed)
    "BaseHFDataset",
    "PretrainDataset",
    "SFTDataset",
    "RLDataset",
    # JSONL-backed datasets
    "BaseJSONLDataset",
    "PretrainJSONLDataset",
    "SFTJSONLDataset",
    "RLJSONLDataset",
    # Tokenizer wrapper and SFT record helpers
    "SentencePieceTokenizerWrapper",
    "parse_sft_record",
    "build_sft_sequence_tokens",
    "apply_response_masking",
    "build_response_only_next_token_labels",
    # Loading, batching and dataset construction
    "get_dataloader",
    "AsyncBatchIterator",
    "TokenizationQueue",
    "DatasetFactory",
]