# Source: MultiModal / data_config.py (uploaded by szxllm, "Upload 20 files", commit cd66851)
# data_config.py
"""
预训练和后训练数据集配置
"""
PRETRAIN_DATASETS = {
# 文本数据集
'the_pile': {
'type': 'text',
'hf_path': 'EleutherAI/pile',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 1.0,
'description': 'The Pile - 825GB diverse text corpus'
},
'c4': {
'type': 'text',
'hf_path': 'allenai/c4',
'config': 'en',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.5,
'description': 'C4 - Colossal Clean Crawled Corpus'
},
'wikipedia': {
'type': 'text',
'hf_path': 'HuggingFaceFW/fineweb-edu',
'config': 'sample-10BT',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.3,
'description': 'FineWeb Edu - High quality educational content'
},
'bookcorpus': {
'type': 'text',
'hf_path': 'HuggingFaceTB/smollm-corpus',
'config': 'cosmopedia-v2',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.2,
'description': 'Synthetic textbooks and stories'
},
# 代码数据集
'codeparrot': {
'type': 'code',
'hf_path': 'bigcode/the-stack-smol',
'config': 'default',
'split': 'train',
'streaming': True,
'text_field': 'content',
'weight': 0.3,
'description': 'The Stack Smol - code'
},
'the_stack': {
'type': 'code',
'hf_path': 'bigcode/the-stack-dedup',
'split': 'train',
'streaming': True,
'text_field': 'content',
'weight': 0.2,
'description': 'The Stack - deduplicated code'
},
# 多模态数据集
'laion400m': {
'type': 'image_text',
'hf_path': 'laion/laion400m',
'split': 'train',
'streaming': True,
'image_field': 'url',
'text_field': 'caption',
'weight': 0.4,
'description': 'LAION-400M image-text pairs'
},
'conceptual_captions': {
'type': 'image_text',
'hf_path': 'google-research-datasets/conceptual_captions',
'split': 'train',
'streaming': False,
'image_field': 'image_url',
'text_field': 'caption',
'weight': 0.2,
'description': 'Conceptual Captions 3M'
},
}
# Post-training dataset configuration (instruction tuning + alignment).
POSTTRAIN_DATASETS: dict[str, dict] = {
    # Instruction-tuning datasets
    'flan_v2': {
        'type': 'instruction',
        'hf_path': 'Muennighoff/flan',
        'split': 'train',
        'streaming': True,
        'instruction_field': 'inputs',
        'response_field': 'targets',
        'weight': 1.0,
        'max_samples': 100000,  # presumably caps examples taken from this source — confirm in loader
        'description': 'FLAN v2 collection'
    },
    'alpaca': {
        'type': 'instruction',
        'hf_path': 'tatsu-lab/alpaca',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'input_field': 'input',  # Alpaca carries a separate input field
        'response_field': 'output',
        'weight': 0.5,
        'description': 'Stanford Alpaca 52K'
    },
    'dolly': {
        'type': 'instruction',
        'hf_path': 'databricks/databricks-dolly-15k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'context_field': 'context',  # Dolly has an extra context field
        'response_field': 'response',
        'weight': 0.3,
        'description': 'Dolly 15K'
    },
    'oasst1': {
        'type': 'conversation',
        'hf_path': 'OpenAssistant/oasst1',
        'split': 'train',
        'streaming': False,
        'weight': 0.4,
        'description': 'OpenAssistant Conversations',
        # OASST1 stores conversations as a message tree, so it needs
        # special handling (likely custom preprocessing).
    },
    'sharegpt': {
        'type': 'conversation',
        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
        'split': 'train',
        'streaming': False,
        'weight': 0.3,
        'max_samples': 50000,
        'description': 'ShareGPT conversations'
    },
    # Code-instruction datasets
    'code_alpaca': {
        'type': 'code_instruction',
        'hf_path': 'sahil2801/CodeAlpaca-20k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 0.3,
        'description': 'Code Alpaca 20K'
    },
    # Multimodal instruction datasets
    'llava_instruct': {
        'type': 'multimodal_instruction',
        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
        'split': 'train',
        'streaming': False,
        'image_field': 'image',
        'instruction_field': 'conversations',
        'weight': 0.5,
        'description': 'LLaVA visual instruction tuning'
    },
    # Preference datasets (for RLHF)
    'hh_rlhf': {
        'type': 'preference',
        'hf_path': 'Anthropic/hh-rlhf',
        'split': 'train',
        'streaming': False,
        'chosen_field': 'chosen',
        'rejected_field': 'rejected',
        'weight': 1.0,
        'description': 'Anthropic HH-RLHF'
    },
    'ultrafeedback': {
        'type': 'preference',
        'hf_path': 'openbmb/UltraFeedback',
        'split': 'train',
        'streaming': True,
        'chosen_field': 'chosen',  # field mapping for preference pairs
        'rejected_field': 'rejected',
        # NOTE(review): raw UltraFeedback exposes rated 'completions' rather than
        # chosen/rejected columns — verify these fields exist or preprocess first.
        'weight': 0.5,
        'max_samples': 50000,
        'description': 'UltraFeedback preferences'
    },
    'debug_water': {
        'type': 'instruction',
        'hf_path': 'json',  # use the generic 'json' builder
        'data_files': 'debug_water.json',  # local file generated for this test
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 1.0,
        'description': 'Overfitting test for water'
    },
}
# Lightweight datasets for quick validation runs.
TEST_DATASETS = {
    "tiny_shakespeare": {
        "type": "text",
        "hf_path": "tiny_shakespeare",
        "split": "train",
        "streaming": False,
        "text_field": "text",
        "weight": 1.0,
        "description": "Tiny Shakespeare for testing",
    },
    "gsm8k": {
        "type": "instruction",
        "hf_path": "gsm8k",
        "config": "main",
        "split": "train",
        "streaming": False,
        "instruction_field": "question",
        "response_field": "answer",
        "weight": 1.0,
        "description": "GSM8K math problems",
    },
}
# Pretraining mixing strategies: each mix pairs dataset aliases
# (keys of PRETRAIN_DATASETS) with matching relative sampling weights.
PRETRAIN_MIX = {
    "default": {
        "datasets": ["c4", "wikipedia", "bookcorpus", "codeparrot"],
        "weights": [0.5, 0.2, 0.2, 0.1],
        "description": "Default pretrain mix",
    },
    "code_heavy": {
        "datasets": ["c4", "codeparrot", "the_stack", "wikipedia"],
        "weights": [0.3, 0.4, 0.2, 0.1],
        "description": "Code-heavy mix",
    },
    "multimodal": {
        "datasets": ["c4", "wikipedia", "laion400m", "conceptual_captions"],
        "weights": [0.4, 0.2, 0.3, 0.1],
        "description": "Multimodal mix",
    },
    "text_only": {
        "datasets": ["c4", "wikipedia", "bookcorpus"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Text-only mix for testing",
    },
}
# Post-training mixing strategies: aliases refer to POSTTRAIN_DATASETS keys.
POSTTRAIN_MIX = {
    "default": {
        "datasets": ["flan_v2", "alpaca", "dolly", "oasst1"],
        "weights": [0.4, 0.3, 0.2, 0.1],
        "description": "Default instruction tuning mix",
    },
    "conversation": {
        "datasets": ["oasst1", "sharegpt", "alpaca"],
        "weights": [0.4, 0.4, 0.2],
        "description": "Conversation-focused mix",
    },
    "code_instruct": {
        "datasets": ["code_alpaca", "alpaca", "flan_v2"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Code instruction mix",
    },
    "simple_instruct": {
        "datasets": ["alpaca", "dolly"],
        "weights": [0.6, 0.4],
        "description": "Simple instruction mix for testing",
    },
    "debug_mix": {
        "datasets": ["debug_water"],
        "weights": [1.0],
        "description": "Debug mix for overfitting",
    },
}
# Download & caching configuration.
DATASET_CACHE_DIR = "./dataset_cache"  # cache for processed datasets
HF_CACHE_DIR = "./hf_cache"            # cache for Hugging Face downloads
MAX_RETRIES = 3                        # download retry attempts
DOWNLOAD_TIMEOUT = 300                 # download timeout (likely seconds — confirm in downloader)
# Data-preprocessing configuration.
PREPROCESSING_CONFIG = {
    "max_seq_length": 2048,        # upper bound on sequence length
    "min_seq_length": 32,          # lower bound on sequence length
    "num_workers": 4,              # worker count
    "batch_size": 8,
    "shuffle_buffer_size": 10000,  # buffer size for streaming shuffle
    "seed": 42,                    # RNG seed for reproducibility
}