# Source: MultiModal / data_config.py (uploaded by szxllm, "Upload 20 files", commit cd66851)
# data_config.py
"""
预训练和后训练数据集配置
"""
PRETRAIN_DATASETS = {
# 文本数据集
'the_pile': {
'type': 'text',
'hf_path': 'EleutherAI/pile',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 1.0,
'description': 'The Pile - 825GB diverse text corpus'
},
'c4': {
'type': 'text',
'hf_path': 'allenai/c4',
'config': 'en',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.5,
'description': 'C4 - Colossal Clean Crawled Corpus'
},
'wikipedia': {
'type': 'text',
'hf_path': 'HuggingFaceFW/fineweb-edu',
'config': 'sample-10BT',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.3,
'description': 'FineWeb Edu - High quality educational content'
},
'bookcorpus': {
'type': 'text',
'hf_path': 'HuggingFaceTB/smollm-corpus',
'config': 'cosmopedia-v2',
'split': 'train',
'streaming': True,
'text_field': 'text',
'weight': 0.2,
'description': 'Synthetic textbooks and stories'
},
# 代码数据集
'codeparrot': {
'type': 'code',
'hf_path': 'bigcode/the-stack-smol',
'config': 'default',
'split': 'train',
'streaming': True,
'text_field': 'content',
'weight': 0.3,
'description': 'The Stack Smol - code'
},
'the_stack': {
'type': 'code',
'hf_path': 'bigcode/the-stack-dedup',
'split': 'train',
'streaming': True,
'text_field': 'content',
'weight': 0.2,
'description': 'The Stack - deduplicated code'
},
# 多模态数据集
'laion400m': {
'type': 'image_text',
'hf_path': 'laion/laion400m',
'split': 'train',
'streaming': True,
'image_field': 'url',
'text_field': 'caption',
'weight': 0.4,
'description': 'LAION-400M image-text pairs'
},
'conceptual_captions': {
'type': 'image_text',
'hf_path': 'google-research-datasets/conceptual_captions',
'split': 'train',
'streaming': False,
'image_field': 'image_url',
'text_field': 'caption',
'weight': 0.2,
'description': 'Conceptual Captions 3M'
},
}
# Post-training dataset configuration (instruction tuning + alignment).
POSTTRAIN_DATASETS: dict[str, dict] = {
    # Instruction-tuning datasets
    'flan_v2': {
        'type': 'instruction',
        'hf_path': 'Muennighoff/flan',
        'split': 'train',
        'streaming': True,
        'instruction_field': 'inputs',
        'response_field': 'targets',
        'weight': 1.0,
        'max_samples': 100000,  # presumably caps examples taken from this source — confirm in loader
        'description': 'FLAN v2 collection'
    },
    'alpaca': {
        'type': 'instruction',
        'hf_path': 'tatsu-lab/alpaca',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'input_field': 'input',  # Alpaca carries a separate input field
        'response_field': 'output',
        'weight': 0.5,
        'description': 'Stanford Alpaca 52K'
    },
    'dolly': {
        'type': 'instruction',
        'hf_path': 'databricks/databricks-dolly-15k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'context_field': 'context',  # Dolly has an extra context field
        'response_field': 'response',
        'weight': 0.3,
        'description': 'Dolly 15K'
    },
    'oasst1': {
        'type': 'conversation',
        'hf_path': 'OpenAssistant/oasst1',
        'split': 'train',
        'streaming': False,
        'weight': 0.4,
        'description': 'OpenAssistant Conversations',
        # OASST1 stores conversations as a message tree, so it needs
        # special handling (likely custom preprocessing).
    },
    'sharegpt': {
        'type': 'conversation',
        'hf_path': 'anon8231489123/ShareGPT_Vicuna_unfiltered',
        'split': 'train',
        'streaming': False,
        'weight': 0.3,
        'max_samples': 50000,
        'description': 'ShareGPT conversations'
    },
    # Code-instruction datasets
    'code_alpaca': {
        'type': 'code_instruction',
        'hf_path': 'sahil2801/CodeAlpaca-20k',
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 0.3,
        'description': 'Code Alpaca 20K'
    },
    # Multimodal instruction datasets
    'llava_instruct': {
        'type': 'multimodal_instruction',
        'hf_path': 'liuhaotian/LLaVA-Instruct-150K',
        'split': 'train',
        'streaming': False,
        'image_field': 'image',
        'instruction_field': 'conversations',
        'weight': 0.5,
        'description': 'LLaVA visual instruction tuning'
    },
    # Preference datasets (for RLHF)
    'hh_rlhf': {
        'type': 'preference',
        'hf_path': 'Anthropic/hh-rlhf',
        'split': 'train',
        'streaming': False,
        'chosen_field': 'chosen',
        'rejected_field': 'rejected',
        'weight': 1.0,
        'description': 'Anthropic HH-RLHF'
    },
    'ultrafeedback': {
        'type': 'preference',
        'hf_path': 'openbmb/UltraFeedback',
        'split': 'train',
        'streaming': True,
        'chosen_field': 'chosen',  # field mapping for preference pairs
        'rejected_field': 'rejected',
        # NOTE(review): raw UltraFeedback exposes rated 'completions' rather than
        # chosen/rejected columns — verify these fields exist or preprocess first.
        'weight': 0.5,
        'max_samples': 50000,
        'description': 'UltraFeedback preferences'
    },
    'debug_water': {
        'type': 'instruction',
        'hf_path': 'json',  # use the generic 'json' builder
        'data_files': 'debug_water.json',  # local file generated for this test
        'split': 'train',
        'streaming': False,
        'instruction_field': 'instruction',
        'response_field': 'output',
        'weight': 1.0,
        'description': 'Overfitting test for water'
    },
}
# Lightweight datasets for quick validation runs.
TEST_DATASETS = {
    "tiny_shakespeare": {
        "type": "text",
        "hf_path": "tiny_shakespeare",
        "split": "train",
        "streaming": False,
        "text_field": "text",
        "weight": 1.0,
        "description": "Tiny Shakespeare for testing",
    },
    "gsm8k": {
        "type": "instruction",
        "hf_path": "gsm8k",
        "config": "main",
        "split": "train",
        "streaming": False,
        "instruction_field": "question",
        "response_field": "answer",
        "weight": 1.0,
        "description": "GSM8K math problems",
    },
}
# Pretraining mixing strategies: each mix pairs dataset aliases
# (keys of PRETRAIN_DATASETS) with matching relative sampling weights.
PRETRAIN_MIX = {
    "default": {
        "datasets": ["c4", "wikipedia", "bookcorpus", "codeparrot"],
        "weights": [0.5, 0.2, 0.2, 0.1],
        "description": "Default pretrain mix",
    },
    "code_heavy": {
        "datasets": ["c4", "codeparrot", "the_stack", "wikipedia"],
        "weights": [0.3, 0.4, 0.2, 0.1],
        "description": "Code-heavy mix",
    },
    "multimodal": {
        "datasets": ["c4", "wikipedia", "laion400m", "conceptual_captions"],
        "weights": [0.4, 0.2, 0.3, 0.1],
        "description": "Multimodal mix",
    },
    "text_only": {
        "datasets": ["c4", "wikipedia", "bookcorpus"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Text-only mix for testing",
    },
}
# Post-training mixing strategies: aliases refer to POSTTRAIN_DATASETS keys.
POSTTRAIN_MIX = {
    "default": {
        "datasets": ["flan_v2", "alpaca", "dolly", "oasst1"],
        "weights": [0.4, 0.3, 0.2, 0.1],
        "description": "Default instruction tuning mix",
    },
    "conversation": {
        "datasets": ["oasst1", "sharegpt", "alpaca"],
        "weights": [0.4, 0.4, 0.2],
        "description": "Conversation-focused mix",
    },
    "code_instruct": {
        "datasets": ["code_alpaca", "alpaca", "flan_v2"],
        "weights": [0.5, 0.3, 0.2],
        "description": "Code instruction mix",
    },
    "simple_instruct": {
        "datasets": ["alpaca", "dolly"],
        "weights": [0.6, 0.4],
        "description": "Simple instruction mix for testing",
    },
    "debug_mix": {
        "datasets": ["debug_water"],
        "weights": [1.0],
        "description": "Debug mix for overfitting",
    },
}
# Download & caching configuration.
DATASET_CACHE_DIR = "./dataset_cache"  # cache for processed datasets
HF_CACHE_DIR = "./hf_cache"            # cache for Hugging Face downloads
MAX_RETRIES = 3                        # download retry attempts
DOWNLOAD_TIMEOUT = 300                 # download timeout (likely seconds — confirm in downloader)
# Data-preprocessing configuration.
PREPROCESSING_CONFIG = {
    "max_seq_length": 2048,        # upper bound on sequence length
    "min_seq_length": 32,          # lower bound on sequence length
    "num_workers": 4,              # worker count
    "batch_size": 8,
    "shuffle_buffer_size": 10000,  # buffer size for streaming shuffle
    "seed": 42,                    # RNG seed for reproducibility
}