"""Dataset configuration for pretraining and post-training."""

# Registry of pretraining corpora, keyed by a short dataset name.
#
# Fields used by the entries below:
#   type         - modality tag: 'text', 'code' or 'image_text'
#   hf_path      - dataset identifier (presumably a Hugging Face Hub repo id,
#                  judging by the key name - confirm against the loader)
#   config       - optional configuration / subset name (absent on some entries)
#   split        - split to read
#   streaming    - boolean streaming flag
#   text_field   - column holding the text (the caption for image_text entries)
#   image_field  - column holding the image reference (image_text entries only)
#   weight       - per-dataset weight; how it is consumed is not visible here
#   description  - human-readable label
PRETRAIN_DATASETS = {
    # --- Plain text corpora ---
    'the_pile': {
        'type': 'text',
        'hf_path': 'EleutherAI/pile',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 1.0,
        'description': 'The Pile - 825GB diverse text corpus',
    },
    'c4': {
        'type': 'text',
        'hf_path': 'allenai/c4',
        'config': 'en',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.5,
        'description': 'C4 - Colossal Clean Crawled Corpus',
    },
    # NOTE(review): the key says 'wikipedia' but the entry points at
    # FineWeb-Edu. The key is kept because the mix recipes refer to it.
    'wikipedia': {
        'type': 'text',
        'hf_path': 'HuggingFaceFW/fineweb-edu',
        'config': 'sample-10BT',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.3,
        'description': 'FineWeb Edu - High quality educational content',
    },
    # NOTE(review): the key says 'bookcorpus' but the entry points at the
    # cosmopedia-v2 config of smollm-corpus. Key kept for the mix recipes.
    'bookcorpus': {
        'type': 'text',
        'hf_path': 'HuggingFaceTB/smollm-corpus',
        'config': 'cosmopedia-v2',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.2,
        'description': 'Synthetic textbooks and stories',
    },

    # --- Source code corpora ---
    # NOTE(review): the key says 'codeparrot' but the entry points at
    # bigcode/the-stack-smol. Key kept for the mix recipes.
    'codeparrot': {
        'type': 'code',
        'hf_path': 'bigcode/the-stack-smol',
        'config': 'default',
        'split': 'train',
        'streaming': True,
        'text_field': 'content',
        'weight': 0.3,
        'description': 'The Stack Smol - code',
    },
    'the_stack': {
        'type': 'code',
        'hf_path': 'bigcode/the-stack-dedup',
        'split': 'train',
        'streaming': True,
        'text_field': 'content',
        'weight': 0.2,
        'description': 'The Stack - deduplicated code',
    },

    # --- Image/text pair corpora ---
    'laion400m': {
        'type': 'image_text',
        'hf_path': 'laion/laion400m',
        'split': 'train',
        'streaming': True,
        'image_field': 'url',
        'text_field': 'caption',
        'weight': 0.4,
        'description': 'LAION-400M image-text pairs',
    },
    # The only pretraining entry with streaming disabled.
    'conceptual_captions': {
        'type': 'image_text',
        'hf_path': 'google-research-datasets/conceptual_captions',
        'split': 'train',
        'streaming': False,
        'image_field': 'image_url',
        'text_field': 'caption',
        'weight': 0.2,
        'description': 'Conceptual Captions 3M',
    },
}

# Registry of post-training datasets, keyed by a short dataset name.
#
# Common fields: type, hf_path, split, streaming, weight, description, plus an
# optional max_samples cap on some entries. Column-mapping fields vary by type:
#   instruction / code_instruction - instruction_field, response_field and,
#                                    on some entries, input_field or context_field
#   conversation                   - no column mapping is declared here
#   multimodal_instruction         - image_field, instruction_field
#   preference                     - chosen_field, rejected_field
# How a loader interprets these fields is not visible in this file.
POSTTRAIN_DATASETS = {
    # --- Instruction tuning ---
    'flan_v2': {
        'type': 'instruction',
        'hf_path': 'Muennighoff/flan',
        'split': 'train',
        'streaming': True,
        'instruction_field': 'inputs',
        'response_field': 'targets',
        'weight': 1.0,
        'max_samples': 100000,
        'description': 'FLAN v2 collection',
    },
    'alpaca': {
        'type': 'instruction',
        'hf_path': 'tatsu-lab/alpaca',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'input_field': 'input',
        'response_field': 'output',
        'weight': 0.5,
        'description': 'Stanford Alpaca 52K',
    },
    'dolly': {
        'type': 'instruction',
        'hf_path': 'databricks/databricks-dolly-15k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'context_field': 'context',
        'response_field': 'response',
        'weight': 0.3,
        'description': 'Dolly 15K',
    },

    # --- Conversations ---
    'oasst1': {
        'type': 'conversation',
        'hf_path': 'OpenAssistant/oasst1',
        'split': 'train',
        'streaming': False,
        'weight': 0.4,
        'description': 'OpenAssistant Conversations',
    },
    'sharegpt': {
        'type': 'conversation',
        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
        'split': 'train',
        'streaming': False,
        'weight': 0.3,
        'max_samples': 50000,
        'description': 'ShareGPT conversations',
    },

    # --- Code instruction tuning ---
    'code_alpaca': {
        'type': 'code_instruction',
        'hf_path': 'sahil2801/CodeAlpaca-20k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 0.3,
        'description': 'Code Alpaca 20K',
    },

    # --- Multimodal instruction tuning ---
    'llava_instruct': {
        'type': 'multimodal_instruction',
        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
        'split': 'train',
        'streaming': False,
        'image_field': 'image',
        'instruction_field': 'conversations',
        'weight': 0.5,
        'description': 'LLaVA visual instruction tuning',
    },

    # --- Preference data ---
    'hh_rlhf': {
        'type': 'preference',
        'hf_path': 'Anthropic/hh-rlhf',
        'split': 'train',
        'streaming': False,
        'chosen_field': 'chosen',
        'rejected_field': 'rejected',
        'weight': 1.0,
        'description': 'Anthropic HH-RLHF',
    },
    'ultrafeedback': {
        'type': 'preference',
        'hf_path': 'openbmb/UltraFeedback',
        'split': 'train',
        'streaming': True,
        'chosen_field': 'chosen',
        'rejected_field': 'rejected',
        'weight': 0.5,
        'max_samples': 50000,
        'description': 'UltraFeedback preferences',
    },

    # --- Debugging ---
    # Uses 'json' as hf_path together with a data_files entry, unlike the
    # other entries; presumably a local file - confirm against the loader.
    'debug_water': {
        'type': 'instruction',
        'hf_path': 'json',
        'data_files': 'debug_water.json',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 1.0,
        'description': 'Overfitting test for water',
    },
}

# Small datasets described as being for testing; both are loaded
# non-streaming. Entries use the same field layout as the text and
# instruction entries of the training registries.
TEST_DATASETS = {
    'tiny_shakespeare': {
        'type': 'text',
        'hf_path': 'tiny_shakespeare',
        'split': 'train',
        'streaming': False,
        'text_field': 'text',
        'weight': 1.0,
        'description': 'Tiny Shakespeare for testing',
    },
    'gsm8k': {
        'type': 'instruction',
        'hf_path': 'gsm8k',
        'config': 'main',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'question',
        'response_field': 'answer',
        'weight': 1.0,
        'description': 'GSM8K math problems',
    },
}

# Named pretraining mixtures. In each recipe 'datasets' and 'weights' are
# parallel lists: weights[i] belongs to datasets[i], and every weight list
# below sums to 1.0. The dataset names match keys of PRETRAIN_DATASETS.
PRETRAIN_MIX = {
    'default': {
        'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
        'weights': [0.5, 0.2, 0.2, 0.1],
        'description': 'Default pretrain mix',
    },
    'code_heavy': {
        'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
        'weights': [0.3, 0.4, 0.2, 0.1],
        'description': 'Code-heavy mix',
    },
    'multimodal': {
        'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
        'weights': [0.4, 0.2, 0.3, 0.1],
        'description': 'Multimodal mix',
    },
    'text_only': {
        'datasets': ['c4', 'wikipedia', 'bookcorpus'],
        'weights': [0.5, 0.3, 0.2],
        'description': 'Text-only mix for testing',
    },
}

# Named post-training mixtures. In each recipe 'datasets' and 'weights' are
# parallel lists: weights[i] belongs to datasets[i], and every weight list
# below sums to 1.0. The dataset names match keys of POSTTRAIN_DATASETS.
POSTTRAIN_MIX = {
    'default': {
        'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
        'weights': [0.4, 0.3, 0.2, 0.1],
        'description': 'Default instruction tuning mix',
    },
    'conversation': {
        'datasets': ['oasst1', 'sharegpt', 'alpaca'],
        'weights': [0.4, 0.4, 0.2],
        'description': 'Conversation-focused mix',
    },
    'code_instruct': {
        'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
        'weights': [0.5, 0.3, 0.2],
        'description': 'Code instruction mix',
    },
    'simple_instruct': {
        'datasets': ['alpaca', 'dolly'],
        'weights': [0.6, 0.4],
        'description': 'Simple instruction mix for testing',
    },
    # Single-dataset recipe built on the 'debug_water' entry.
    'debug_mix': {
        'datasets': ['debug_water'],
        'weights': [1.0],
        'description': 'Debug mix for overfitting',
    },
}

# Cache locations. Both are relative paths, so they resolve against the
# process's current working directory.
DATASET_CACHE_DIR = "./dataset_cache"
HF_CACHE_DIR = "./hf_cache"

# Download settings.
MAX_RETRIES = 3
DOWNLOAD_TIMEOUT = 300  # presumably seconds - TODO confirm against the downloader

# Preprocessing settings. The unit of the sequence-length bounds is not
# stated in this file (presumably tokens - TODO confirm with the consumer).
PREPROCESSING_CONFIG = {
    'max_seq_length': 2048,
    'min_seq_length': 32,
    'num_workers': 4,
    'batch_size': 8,
    'shuffle_buffer_size': 10000,
    'seed': 42,
}