File size: 8,831 Bytes

cd66851

# data_config.py
"""
Pretraining and post-training dataset configuration.
"""

# Registry of pretraining corpora, keyed by a short dataset name.
#
# Every entry carries the keys "type", "hf_path", "split", "streaming",
# "text_field", "weight" and "description". "config" appears only on some
# entries, and "image_field" only on the image-text entries.
#
# NOTE(review): three keys do not match the corpus their "hf_path" points at
# ("wikipedia" -> HuggingFaceFW/fineweb-edu, "bookcorpus" ->
# HuggingFaceTB/smollm-corpus, "codeparrot" -> bigcode/the-stack-smol).
# PRETRAIN_MIX refers to these keys by name, so they are left unchanged.
PRETRAIN_DATASETS = {
    # Text datasets
    "the_pile": {
        "type": "text",
        "hf_path": "EleutherAI/pile",
        "split": "train",
        "streaming": True,
        "text_field": "text",
        "weight": 1.0,
        "description": "The Pile - 825GB diverse text corpus",
    },
    "c4": {
        "type": "text",
        "hf_path": "allenai/c4",
        "config": "en",
        "split": "train",
        "streaming": True,
        "text_field": "text",
        "weight": 0.5,
        "description": "C4 - Colossal Clean Crawled Corpus",
    },
    "wikipedia": {
        "type": "text",
        "hf_path": "HuggingFaceFW/fineweb-edu",
        "config": "sample-10BT",
        "split": "train",
        "streaming": True,
        "text_field": "text",
        "weight": 0.3,
        "description": "FineWeb Edu - High quality educational content",
    },
    "bookcorpus": {
        "type": "text",
        "hf_path": "HuggingFaceTB/smollm-corpus",
        "config": "cosmopedia-v2",
        "split": "train",
        "streaming": True,
        "text_field": "text",
        "weight": 0.2,
        "description": "Synthetic textbooks and stories",
    },
    # Code datasets
    "codeparrot": {
        "type": "code",
        "hf_path": "bigcode/the-stack-smol",
        "config": "default",
        "split": "train",
        "streaming": True,
        "text_field": "content",
        "weight": 0.3,
        "description": "The Stack Smol - code",
    },
    "the_stack": {
        "type": "code",
        "hf_path": "bigcode/the-stack-dedup",
        "split": "train",
        "streaming": True,
        "text_field": "content",
        "weight": 0.2,
        "description": "The Stack - deduplicated code",
    },
    # Multimodal (image + text) datasets
    "laion400m": {
        "type": "image_text",
        "hf_path": "laion/laion400m",
        "split": "train",
        "streaming": True,
        "image_field": "url",
        "text_field": "caption",
        "weight": 0.4,
        "description": "LAION-400M image-text pairs",
    },
    "conceptual_captions": {
        "type": "image_text",
        "hf_path": "google-research-datasets/conceptual_captions",
        "split": "train",
        "streaming": False,
        "image_field": "image_url",
        "text_field": "caption",
        "weight": 0.2,
        "description": "Conceptual Captions 3M",
    },
}

# Post-training dataset configuration (instruction tuning + alignment).
#
# Registry keyed by a short dataset name. Every entry has "type", "hf_path",
# "split", "streaming", "weight" and "description"; the field-name keys vary
# with "type" (instruction/response fields, chosen/rejected fields, or none
# at all for the conversation entries). "max_samples" is optional.
POSTTRAIN_DATASETS = {
    # Instruction-tuning datasets
    "flan_v2": {
        "type": "instruction",
        "hf_path": "Muennighoff/flan",
        "split": "train",
        "streaming": True,
        "instruction_field": "inputs",
        "response_field": "targets",
        "weight": 1.0,
        "max_samples": 100_000,
        "description": "FLAN v2 collection",
    },
    "alpaca": {
        "type": "instruction",
        "hf_path": "tatsu-lab/alpaca",
        "split": "train",
        "streaming": False,
        "instruction_field": "instruction",
        "input_field": "input",
        "response_field": "output",
        "weight": 0.5,
        "description": "Stanford Alpaca 52K",
    },
    "dolly": {
        "type": "instruction",
        "hf_path": "databricks/databricks-dolly-15k",
        "split": "train",
        "streaming": False,
        "instruction_field": "instruction",
        "context_field": "context",  # Dolly has a context field
        "response_field": "response",
        "weight": 0.3,
        "description": "Dolly 15K",
    },
    "oasst1": {
        "type": "conversation",
        "hf_path": "OpenAssistant/oasst1",
        "split": "train",
        "streaming": False,
        "weight": 0.4,
        "description": "OpenAssistant Conversations",
        # OASST1 needs special handling because it is tree-structured;
        # it may require custom preprocessing.
    },
    "sharegpt": {
        "type": "conversation",
        "hf_path": "anon8231489123/ShareGPT_Vicuna_unfiltered",
        "split": "train",
        "streaming": False,
        "weight": 0.3,
        "max_samples": 50_000,
        "description": "ShareGPT conversations",
    },
    # Code instruction datasets
    "code_alpaca": {
        "type": "code_instruction",
        "hf_path": "sahil2801/CodeAlpaca-20k",
        "split": "train",
        "streaming": False,
        "instruction_field": "instruction",
        "response_field": "output",
        "weight": 0.3,
        "description": "Code Alpaca 20K",
    },
    # Multimodal instruction datasets
    "llava_instruct": {
        "type": "multimodal_instruction",
        "hf_path": "liuhaotian/LLaVA-Instruct-150K",
        "split": "train",
        "streaming": False,
        "image_field": "image",
        "instruction_field": "conversations",
        "weight": 0.5,
        "description": "LLaVA visual instruction tuning",
    },
    # Preference datasets (used for RLHF)
    "hh_rlhf": {
        "type": "preference",
        "hf_path": "Anthropic/hh-rlhf",
        "split": "train",
        "streaming": False,
        "chosen_field": "chosen",
        "rejected_field": "rejected",
        "weight": 1.0,
        "description": "Anthropic HH-RLHF",
    },
    "ultrafeedback": {
        "type": "preference",
        "hf_path": "openbmb/UltraFeedback",
        "split": "train",
        "streaming": True,
        # Field configuration added.
        # NOTE(review): verify that this dataset really exposes "chosen" /
        # "rejected" columns; the raw release may need binarizing first.
        "chosen_field": "chosen",
        "rejected_field": "rejected",
        "weight": 0.5,
        "max_samples": 50_000,
        "description": "UltraFeedback preferences",
    },
    "debug_water": {
        "type": "instruction",
        "hf_path": "json",  # use the json loader
        "data_files": "debug_water.json",  # points to the file generated earlier
        "split": "train",
        "streaming": False,
        "instruction_field": "instruction",
        "response_field": "output",
        "weight": 1.0,
        "description": "Overfitting test for water",
    },
}

# Lightweight test datasets (for quick validation).
# Entries use the same key layout as the pretraining / post-training
# registries: one plain-text corpus and one instruction-style dataset.
TEST_DATASETS = {
    "tiny_shakespeare": {
        "type": "text",
        "hf_path": "tiny_shakespeare",
        "split": "train",
        "streaming": False,
        "text_field": "text",
        "weight": 1.0,
        "description": "Tiny Shakespeare for testing",
    },
    "gsm8k": {
        "type": "instruction",
        "hf_path": "gsm8k",
        "config": "main",
        "split": "train",
        "streaming": False,
        "instruction_field": "question",
        "response_field": "answer",
        "weight": 1.0,
        "description": "GSM8K math problems",
    },
}

# Dataset mixing strategies.
#
# Each named mix pairs a "datasets" list (keys of PRETRAIN_DATASETS) with a
# parallel "weights" list of the same length; the weights of every mix below
# add up to 1.
# NOTE(review): these mix weights are separate from the per-dataset "weight"
# values in PRETRAIN_DATASETS; how the two interact is decided by the
# consumer, which is not visible in this file.
PRETRAIN_MIX = {
    "default": {
        "datasets": ["c4", "wikipedia", "bookcorpus", "codeparrot"],
        "weights": [0.5, 0.2, 0.2, 0.1],
        "description": "Default pretrain mix",
    },
    "code_heavy": {
        "datasets": ["c4", "codeparrot", "the_stack", "wikipedia"],
        "weights": [0.3, 0.4, 0.2, 0.1],
        "description": "Code-heavy mix",
    },
    "multimodal": {
        "datasets": ["c4", "wikipedia", "laion400m", "conceptual_captions"],
        "weights": [0.4, 0.2, 0.3, 0.1],
        "description": "Multimodal mix",
    },
    "text_only": {
        "datasets": ["c4", "wikipedia", "bookcorpus"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Text-only mix for testing",
    },
}

# Post-training mixing strategies.
#
# Same layout as the pretraining mixes: "datasets" holds keys of
# POSTTRAIN_DATASETS and "weights" is a parallel list of the same length
# whose values add up to 1 in every mix below.
POSTTRAIN_MIX = {
    "default": {
        "datasets": ["flan_v2", "alpaca", "dolly", "oasst1"],
        "weights": [0.4, 0.3, 0.2, 0.1],
        "description": "Default instruction tuning mix",
    },
    "conversation": {
        "datasets": ["oasst1", "sharegpt", "alpaca"],
        "weights": [0.4, 0.4, 0.2],
        "description": "Conversation-focused mix",
    },
    "code_instruct": {
        "datasets": ["code_alpaca", "alpaca", "flan_v2"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Code instruction mix",
    },
    "simple_instruct": {
        "datasets": ["alpaca", "dolly"],
        "weights": [0.6, 0.4],
        "description": "Simple instruction mix for testing",
    },
    # Single-dataset mix used to overfit on the debug data.
    "debug_mix": {
        "datasets": ["debug_water"],
        "weights": [1.0],
        "description": "Debug mix for overfitting",
    },
}

# Download and cache configuration.
# Both cache locations are relative paths, so they resolve against the
# process's current working directory.
DATASET_CACHE_DIR: str = "./dataset_cache"
HF_CACHE_DIR: str = "./hf_cache"
# NOTE(review): presumably the number of download retry attempts — confirm
# against the code that consumes it.
MAX_RETRIES: int = 3
# NOTE(review): presumably expressed in seconds — confirm against the code
# that consumes it.
DOWNLOAD_TIMEOUT: int = 300

# Data processing configuration.
# NOTE(review): the unit of the sequence-length bounds (tokens vs.
# characters) is not visible in this file — confirm against the consumer.
PREPROCESSING_CONFIG = {
    "max_seq_length": 2048,
    "min_seq_length": 32,
    "num_workers": 4,
    "batch_size": 8,
    "shuffle_buffer_size": 10_000,
    "seed": 42,
}