# data_config.py
"""Dataset configuration for pretraining and post-training."""

# ---------------------------------------------------------------------------
# Pretraining datasets
# ---------------------------------------------------------------------------
PRETRAIN_DATASETS = {
    # Text corpora
    'the_pile': {
        'type': 'text',
        'hf_path': 'EleutherAI/pile',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 1.0,
        'description': 'The Pile - 825GB diverse text corpus',
    },
    'c4': {
        'type': 'text',
        'hf_path': 'allenai/c4',
        'config': 'en',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.5,
        'description': 'C4 - Colossal Clean Crawled Corpus',
    },
    'wikipedia': {
        'type': 'text',
        'hf_path': 'HuggingFaceFW/fineweb-edu',
        'config': 'sample-10BT',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.3,
        'description': 'FineWeb Edu - High quality educational content',
    },
    'bookcorpus': {
        'type': 'text',
        'hf_path': 'HuggingFaceTB/smollm-corpus',
        'config': 'cosmopedia-v2',
        'split': 'train',
        'streaming': True,
        'text_field': 'text',
        'weight': 0.2,
        'description': 'Synthetic textbooks and stories',
    },
    # Code corpora
    'codeparrot': {
        'type': 'code',
        'hf_path': 'bigcode/the-stack-smol',
        'config': 'default',
        'split': 'train',
        'streaming': True,
        'text_field': 'content',
        'weight': 0.3,
        'description': 'The Stack Smol - code',
    },
    'the_stack': {
        'type': 'code',
        'hf_path': 'bigcode/the-stack-dedup',
        'split': 'train',
        'streaming': True,
        'text_field': 'content',
        'weight': 0.2,
        'description': 'The Stack - deduplicated code',
    },
    # Multimodal (image-text) corpora
    'laion400m': {
        'type': 'image_text',
        'hf_path': 'laion/laion400m',
        'split': 'train',
        'streaming': True,
        'image_field': 'url',
        'text_field': 'caption',
        'weight': 0.4,
        'description': 'LAION-400M image-text pairs',
    },
    'conceptual_captions': {
        'type': 'image_text',
        'hf_path': 'google-research-datasets/conceptual_captions',
        'split': 'train',
        'streaming': False,
        'image_field': 'image_url',
        'text_field': 'caption',
        'weight': 0.2,
        'description': 'Conceptual Captions 3M',
    },
}

# ---------------------------------------------------------------------------
# Post-training datasets (instruction tuning + alignment)
# ---------------------------------------------------------------------------
POSTTRAIN_DATASETS = {
    # Instruction-tuning datasets
    'flan_v2': {
        'type': 'instruction',
        'hf_path': 'Muennighoff/flan',
        'split': 'train',
        'streaming': True,
        'instruction_field': 'inputs',
        'response_field': 'targets',
        'weight': 1.0,
        'max_samples': 100000,
        'description': 'FLAN v2 collection',
    },
    'alpaca': {
        'type': 'instruction',
        'hf_path': 'tatsu-lab/alpaca',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'input_field': 'input',
        'response_field': 'output',
        'weight': 0.5,
        'description': 'Stanford Alpaca 52K',
    },
    'dolly': {
        'type': 'instruction',
        'hf_path': 'databricks/databricks-dolly-15k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'context_field': 'context',  # Dolly has an extra context field
        'response_field': 'response',
        'weight': 0.3,
        'description': 'Dolly 15K',
    },
    'oasst1': {
        'type': 'conversation',
        'hf_path': 'OpenAssistant/oasst1',
        'split': 'train',
        'streaming': False,
        'weight': 0.4,
        'description': 'OpenAssistant Conversations',
        # OASST1 needs special handling because it is a tree structure;
        # custom preprocessing may be required.
    },
    'sharegpt': {
        'type': 'conversation',
        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
        'split': 'train',
        'streaming': False,
        'weight': 0.3,
        'max_samples': 50000,
        'description': 'ShareGPT conversations',
    },
    # Code-instruction datasets
    'code_alpaca': {
        'type': 'code_instruction',
        'hf_path': 'sahil2801/CodeAlpaca-20k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 0.3,
        'description': 'Code Alpaca 20K',
    },
    # Multimodal instruction datasets
    'llava_instruct': {
        'type': 'multimodal_instruction',
        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
        'split': 'train',
        'streaming': False,
        'image_field': 'image',
        'instruction_field': 'conversations',
        'weight': 0.5,
        'description': 'LLaVA visual instruction tuning',
    },
    # Preference datasets (for RLHF)
    'hh_rlhf': {
        'type': 'preference',
        'hf_path': 'Anthropic/hh-rlhf',
        'split': 'train',
        'streaming': False,
        'chosen_field': 'chosen',
        'rejected_field': 'rejected',
        'weight': 1.0,
        'description': 'Anthropic HH-RLHF',
    },
    'ultrafeedback': {
        'type': 'preference',
        'hf_path': 'openbmb/UltraFeedback',
        'split': 'train',
        'streaming': True,
        'chosen_field': 'chosen',  # explicit field configuration
        'rejected_field': 'rejected',
        'weight': 0.5,
        'max_samples': 50000,
        'description': 'UltraFeedback preferences',
    },
    'debug_water': {
        'type': 'instruction',
        'hf_path': 'json',  # use the generic json loader
        'data_files': 'debug_water.json',  # points at the locally generated file
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 1.0,
        'description': 'Overfitting test for water',
    },
}

# ---------------------------------------------------------------------------
# Lightweight test datasets (for quick validation)
# ---------------------------------------------------------------------------
TEST_DATASETS = {
    'tiny_shakespeare': {
        'type': 'text',
        'hf_path': 'tiny_shakespeare',
        'split': 'train',
        'streaming': False,
        'text_field': 'text',
        'weight': 1.0,
        'description': 'Tiny Shakespeare for testing',
    },
    'gsm8k': {
        'type': 'instruction',
        'hf_path': 'gsm8k',
        'config': 'main',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'question',
        'response_field': 'answer',
        'weight': 1.0,
        'description': 'GSM8K math problems',
    },
}

# ---------------------------------------------------------------------------
# Dataset mixing strategies
# ---------------------------------------------------------------------------
PRETRAIN_MIX = {
    'default': {
        'datasets': ['c4', 'wikipedia', 'bookcorpus', 'codeparrot'],
        'weights': [0.5, 0.2, 0.2, 0.1],
        'description': 'Default pretrain mix',
    },
    'code_heavy': {
        'datasets': ['c4', 'codeparrot', 'the_stack', 'wikipedia'],
        'weights': [0.3, 0.4, 0.2, 0.1],
        'description': 'Code-heavy mix',
    },
    'multimodal': {
        'datasets': ['c4', 'wikipedia', 'laion400m', 'conceptual_captions'],
        'weights': [0.4, 0.2, 0.3, 0.1],
        'description': 'Multimodal mix',
    },
    'text_only': {
        'datasets': ['c4', 'wikipedia', 'bookcorpus'],
        'weights': [0.5, 0.3, 0.2],
        'description': 'Text-only mix for testing',
    },
}

POSTTRAIN_MIX = {
    'default': {
        'datasets': ['flan_v2', 'alpaca', 'dolly', 'oasst1'],
        'weights': [0.4, 0.3, 0.2, 0.1],
        'description': 'Default instruction tuning mix',
    },
    'conversation': {
        'datasets': ['oasst1', 'sharegpt', 'alpaca'],
        'weights': [0.4, 0.4, 0.2],
        'description': 'Conversation-focused mix',
    },
    'code_instruct': {
        'datasets': ['code_alpaca', 'alpaca', 'flan_v2'],
        'weights': [0.5, 0.3, 0.2],
        'description': 'Code instruction mix',
    },
    'simple_instruct': {
        'datasets': ['alpaca', 'dolly'],
        'weights': [0.6, 0.4],
        'description': 'Simple instruction mix for testing',
    },
    'debug_mix': {
        'datasets': ['debug_water'],
        'weights': [1.0],
        'description': 'Debug mix for overfitting',
    },
}

# ---------------------------------------------------------------------------
# Download and caching settings
# ---------------------------------------------------------------------------
DATASET_CACHE_DIR = "./dataset_cache"
HF_CACHE_DIR = "./hf_cache"
MAX_RETRIES = 3
DOWNLOAD_TIMEOUT = 300  # seconds

# ---------------------------------------------------------------------------
# Data preprocessing settings
# ---------------------------------------------------------------------------
PREPROCESSING_CONFIG = {
    'max_seq_length': 2048,
    'min_seq_length': 32,
    'num_workers': 4,
    'batch_size': 8,
    'shuffle_buffer_size': 10000,
    'seed': 42,
}