File size: 1,018 Bytes
8d18b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
"""Ultra-sophisticated data pipeline for OpenThoughts-1.2M and custom datasets."""

# Re-export the public names of the sibling modules so that callers can import
# them from this package directly. Import order is kept as-is on purpose: it
# determines the order in which the submodules are first executed.
from .openthoughts_processor import (
    OpenThoughtsProcessor,
    OpenThoughtsDataset,
)
from .advanced_tokenizer import (
    AdvancedTokenizer,
    TokenizerManager,
)
from .quality_filter import (
    QualityFilter,
    filter_dataset,
)
from .curriculum_sampler import (
    CurriculumSampler,
    DifficultyAwareSampler,
)
from .data_augmentation import (
    DataAugmenter,
    augment_sample,
)
from .preprocessing import (
    preprocess_conversation,
    extract_thoughts,
    format_for_training,
)
from .utils import (
    compute_length_statistics,
    analyze_dataset_quality,
)

# Names exposed by ``from <package> import *``; one group per source module,
# listed in the same order as the imports above.
__all__ = [
    # .openthoughts_processor
    "OpenThoughtsProcessor", "OpenThoughtsDataset",
    # .advanced_tokenizer
    "AdvancedTokenizer", "TokenizerManager",
    # .quality_filter
    "QualityFilter", "filter_dataset",
    # .curriculum_sampler
    "CurriculumSampler", "DifficultyAwareSampler",
    # .data_augmentation
    "DataAugmenter", "augment_sample",
    # .preprocessing
    "preprocess_conversation", "extract_thoughts", "format_for_training",
    # .utils
    "compute_length_statistics", "analyze_dataset_quality",
]